From 762149a6e233e1377945da4949f127a664599ef4 Mon Sep 17 00:00:00 2001
From: ZlaTanskY
Date: Thu, 21 Apr 2022 11:35:52 +0200
Subject: [PATCH 1/9] feat: added files to encourage PEP8

---
 Makefile             | 41 +++++++++++++++++++++++++++++++++++++++++
 requirements.dev.txt |  6 ++++++
 2 files changed, 47 insertions(+)
 create mode 100644 Makefile
 create mode 100644 requirements.dev.txt

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..b31a1db
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,41 @@
+# Makefile with some simple commands to make a developer's life easier
+
+
+install-requirements: install-build-essential
+	pip install -r requirements.txt
+
+dev/install-requirements: install-requirements
+	pip install -r requirements.dev.txt
+
+install-build-essential:
+	sudo apt-get update
+	sudo apt-get install build-essential
+
+update-setuptools:
+	pip install --upgrade setuptools wheel
+
+test-unit:
+	pytest tests
+	@echo 'unit tests OK'
+
+lint:
+	pylint cobra
+	@echo 'lint OK'
+
+lint-minimal:
+	pylint -E cobra
+	@echo 'lint minimal OK'
+
+typecheck:
+	mypy cobra
+	@echo 'typecheck OK'
+
+codestyle:
+	pycodestyle cobra
+	@echo 'codestyle OK'
+
+docstyle:
+	pydocstyle cobra
+	@echo 'docstyle OK'
+
+code-qa: typecheck codestyle docstyle lint-minimal
diff --git a/requirements.dev.txt b/requirements.dev.txt
new file mode 100644
index 0000000..3d87710
--- /dev/null
+++ b/requirements.dev.txt
@@ -0,0 +1,6 @@
+mypy>=0.942
+pycodestyle>=2.8.0
+pydocstyle>=6.1.1
+pylint>=2.13.7
+pytest>=7.1.1
+pytest-mock>=3.7.0
\ No newline at end of file

From a488879afee4c283895dc6a105996810763586af Mon Sep 17 00:00:00 2001
From: ZlaTanskY
Date: Thu, 21 Apr 2022 11:36:50 +0200
Subject: [PATCH 2/9] fix: pydocstyle

Except for target_encoder _fit_column, transform and _transform_column.
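For context on the rule set driving this patch: the pydocstyle checks addressed
throughout are mainly D205 (one blank line required between summary line and
description), D400 (first line should end with a period) and D401 (first line
should be in imperative mood). A minimal compliant shape, sketched on a
hypothetical function rather than one taken from cobra:

    def fit(self, data):
        """Fit the estimator.

        The summary above is a single imperative sentence ending in a
        period; any longer description follows after one blank line.
        """

Most of the docstring churn below is exactly this transformation.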
--- cobra/__init__.py | 2 + cobra/evaluation/__init__.py | 2 + cobra/evaluation/evaluator.py | 59 +++++------- cobra/evaluation/pigs_tables.py | 5 +- cobra/evaluation/plotting_utils.py | 6 +- cobra/model_building/__init__.py | 2 + cobra/model_building/forward_selection.py | 73 ++++++++------- cobra/model_building/models.py | 48 +++++----- cobra/model_building/univariate_selection.py | 32 +++---- cobra/preprocessing/__init__.py | 2 + .../categorical_data_processor.py | 60 +++++++------ cobra/preprocessing/kbins_discretizer.py | 79 ++++++++-------- cobra/preprocessing/preprocessor.py | 90 ++++++++++--------- cobra/preprocessing/target_encoder.py | 24 +++-- cobra/utils.py | 9 +- 15 files changed, 268 insertions(+), 225 deletions(-) diff --git a/cobra/__init__.py b/cobra/__init__.py index 7152555..8afad45 100644 --- a/cobra/__init__.py +++ b/cobra/__init__.py @@ -1 +1,3 @@ +"""Cobra module.""" + from .version import __version__ \ No newline at end of file diff --git a/cobra/evaluation/__init__.py b/cobra/evaluation/__init__.py index 1f8f487..d480bdb 100644 --- a/cobra/evaluation/__init__.py +++ b/cobra/evaluation/__init__.py @@ -1,3 +1,5 @@ +"""The evaluation module includes utils and plots to evaluate a created model.""" + from .pigs_tables import generate_pig_tables from .pigs_tables import compute_pig_table from .pigs_tables import plot_incidence diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 5a530dc..f550431 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -1,3 +1,4 @@ +"""Evaluate the created model.""" import numpy as np import pandas as pd @@ -26,8 +27,7 @@ from sklearn.metrics import r2_score class ClassificationEvaluator(): - """Evaluator class encapsulating classification model metrics - and plotting functionality. + """Evaluator class encapsulating classification model metrics and plotting functionality. Attributes ---------- @@ -56,11 +56,13 @@ class ClassificationEvaluator(): (by default 10, so deciles). """ - def __init__(self, - probability_cutoff: float=None, - lift_at: float=0.05, - n_bins: int = 10): - + def __init__( + self, + probability_cutoff: float=None, + lift_at: float=0.05, + n_bins: int = 10 + ): + """Initialize the ClassificationEvaluator.""" self.y_true = None self.y_pred = None @@ -76,8 +78,7 @@ def __init__(self, self.cumulative_gains = None def fit(self, y_true: np.ndarray, y_pred: np.ndarray): - """Fit the evaluator by computing the relevant evaluation metrics on - the inputs. + """Fit the evaluator by computing the relevant evaluation metrics on the inputs. Parameters ---------- @@ -119,8 +120,7 @@ def _compute_scalar_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_pred_b: np.ndarray, lift_at: float) -> pd.Series: - """Convenient function to compute various scalar performance measures - and return them in a pd.Series. + """Compute various scalar performance measures. Parameters ---------- @@ -168,7 +168,6 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): dim : tuple, optional Tuple with width and length of the plot. """ - if self.roc_curve is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") @@ -211,7 +210,6 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), labels : list, optional Optional list of labels, default "0" and "1". """ - if self.confusion_matrix is None: msg = ("This {} instance is not fitted yet. 
Call 'fit' with " "appropriate arguments before using this method.") @@ -240,7 +238,6 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): dim : tuple, optional Tuple with width and length of the plot. """ - if self.lift_curve is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") @@ -291,7 +288,6 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): dim : tuple, optional Tuple with width and length of the plot. """ - if self.lift_curve is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") @@ -340,7 +336,6 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): dim : tuple, optional Tuple with width and length of the plot. """ - with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) @@ -375,8 +370,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): @staticmethod def _find_optimal_cutoff(y_true: np.ndarray, y_pred: np.ndarray) -> float: - """Find the optimal probability cut off point for a - classification model. Wrapper around _compute_optimal_cutoff. + """Find the optimal probability cut off point for a classification model. Parameters ---------- @@ -396,8 +390,7 @@ def _find_optimal_cutoff(y_true: np.ndarray, @staticmethod def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, thresholds: np.ndarray) -> float: - """Find the optimal probability cut-off point for a - classification model. + """Calculate the optimal probability cut-off point for a classification model. The optimal cut-off would be where TPR is high and FPR is low, hence TPR - (1-FPR) should be zero or close to zero for the optimal cut-off. @@ -426,8 +419,7 @@ def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, @staticmethod def _compute_cumulative_gains(y_true: np.ndarray, y_pred: np.ndarray) -> tuple: - """Compute cumulative gains of the model, returns percentages and - gains cumulative gains curves. + """Compute cumulative gains of the model. Code from (https://github.com/reiinakano/scikit-plot/blob/ 2dd3e6a76df77edcbd724c4db25575f70abb57cb/ @@ -445,7 +437,6 @@ def _compute_cumulative_gains(y_true: np.ndarray, tuple With x-labels, and gains. """ - # make y_true a boolean vector y_true = (y_true == 1) @@ -467,8 +458,7 @@ def _compute_cumulative_gains(y_true: np.ndarray, def _compute_lift_per_bin(y_true: np.ndarray, y_pred: np.ndarray, n_bins: int=10) -> tuple: - """Compute lift of the model for a given number of bins, returns x-labels, - lifts and the target incidence to create cumulative response curves. + """Compute lift of the model for a given number of bins. Parameters ---------- @@ -485,7 +475,6 @@ def _compute_lift_per_bin(y_true: np.ndarray, tuple Includes x-labels, lifts per decile, and target incidence. """ - lifts = [ClassificationEvaluator._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=perc_lift) @@ -498,7 +487,7 @@ def _compute_lift_per_bin(y_true: np.ndarray, @staticmethod def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, lift_at: float=0.05) -> float: - """Calculates lift given two arrays on specified level. + """Calculate lift on a specified level. Parameters ---------- @@ -514,7 +503,6 @@ def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, float Lift of the model. 
""" - # Make sure it is numpy array y_true_ = np.array(y_true) y_pred_ = np.array(y_pred) @@ -544,8 +532,7 @@ def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, class RegressionEvaluator(): - """Evaluator class encapsulating regression model metrics - and plotting functionality. + """Evaluator class encapsulating regression model metrics and plotting functionality. Attributes ---------- @@ -560,7 +547,7 @@ class RegressionEvaluator(): """ def __init__(self): - + """Initialize the RegressionEvaluator.""" self.y_true = None self.y_pred = None @@ -569,8 +556,7 @@ def __init__(self): self.qq = None def fit(self, y_true: np.ndarray, y_pred: np.ndarray): - """Fit the evaluator by computing the relevant evaluation metrics on - the inputs. + """Fit the evaluator by computing the relevant evaluation metrics on the inputs. Parameters ---------- @@ -591,8 +577,7 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): @staticmethod def _compute_scalar_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: - """Convenient function to compute various scalar performance measures - and return them in a pd.Series. + """Compute various scalar performance measures. Parameters ---------- @@ -620,8 +605,7 @@ def _compute_scalar_metrics(y_true: np.ndarray, @staticmethod def _compute_qq_residuals(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: - """Convenience function to compute various scalar performance measures - and return them in a pd.Series. + """Compute various scalar performance measures. Parameters ---------- @@ -698,7 +682,6 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)): dim : tuple, optional Tuple with width and length of the plot. """ - if self.qq is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index 7f03b42..8915c5e 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -1,3 +1,4 @@ +"""Create Predictor Insight Graph tables.""" import pandas as pd import matplotlib.pyplot as plt @@ -98,7 +99,9 @@ def plot_incidence(pig_tables: pd.DataFrame, model_type: str, column_order: list=None, dim: tuple=(12, 8)): - """Plots a Predictor Insights Graph (PIG), a graph in which the mean + """Plot a Predictor Insights Graph (PIG). + + A PIG is a graph in which the mean target value is plotted for a number of bins constructed from a predictor variable. When the target is a binary classification target, the plotted mean target value is a true incidence rate. diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 7683f24..5aaf1a2 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -1,3 +1,4 @@ +"""Collection of plotting utils.""" # third party imports import numpy as np @@ -22,7 +23,6 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, path : str, optional Path to store the figure. """ - if "AUC selection" in df_metric.columns: metric = "AUC" ascending = False @@ -86,8 +86,7 @@ def plot_performance_curves(model_performance: pd.DataFrame, "selection": "#ff9500", "validation": "#8064a2"}, metric_name: str=None): - """Plot performance curves generated by the forward feature selection - for the train-selection-validation sets. + """Plot performance curves for the train-selection-validation sets. 
Parameters ---------- @@ -105,7 +104,6 @@ def plot_performance_curves(model_performance: pd.DataFrame, Defaults to RMSE in case of regression and AUC in case of classification. """ - model_type = model_performance["model_type"][0] if metric_name is None: diff --git a/cobra/model_building/__init__.py b/cobra/model_building/__init__.py index 7a646c3..288a2c4 100644 --- a/cobra/model_building/__init__.py +++ b/cobra/model_building/__init__.py @@ -1,3 +1,5 @@ +"""This module includes utils to calculate the best features.""" + from .univariate_selection import compute_univariate_preselection from .univariate_selection import get_preselected_predictors from .univariate_selection import compute_correlations diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 29e06b3..693fed3 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -1,3 +1,4 @@ +"""Feature forward selection.""" import logging from typing import Callable, Optional @@ -10,8 +11,7 @@ log = logging.getLogger(__name__) class ForwardFeatureSelection: - """Perform forward feature selection for a given dataset using a given - algorithm. + """Perform forward feature selection for a given dataset using a given algorithm. Predictors are sequentially added to the model, starting with the one that has the highest univariate predictive power, and then proceeding with those that @@ -35,11 +35,13 @@ class ForwardFeatureSelection: List of fitted models. """ - def __init__(self, - model_type: str="classification", - max_predictors: int=50, - pos_only: bool=True): - + def __init__( + self, + model_type: str="classification", + max_predictors: int=50, + pos_only: bool=True + ): + """Initialize the ForwardFeatureSelection class.""" self.model_type = model_type if model_type == "classification": self.MLModel = LogisticRegressionModel @@ -75,14 +77,17 @@ def get_model_from_step(self, step: int): return self._fitted_models[step] - def compute_model_performances(self, data: pd.DataFrame, - target_column_name: str, - splits: list=["train", "selection", "validation"], - metric: Optional[Callable]=None, - ) -> pd.DataFrame: - """Compute for each model the performance for different sets (e.g. - train-selection-validation) and return them along with a list of - predictors used in the model. Note that the computation of the + def compute_model_performances( + self, data: pd.DataFrame, + target_column_name: str, + splits: list=["train", "selection", "validation"], + metric: Optional[Callable]=None, + ) -> pd.DataFrame: + """ + Compute for each model the performance for different sets. + + Different sets could be cross validation, train-selection-validation, ... + Note that the computation of the performance for each split is cached inside the model itself, so it is inexpensive to perform it multiple times! @@ -168,7 +173,6 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str, In case the number of forced predictors is larger than the maximum number of allowed predictors in the model. """ - assert "split" in train_data.columns, "The train_data input df does not include a split column." assert len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0, \ "The train_data input df does not include a 'train' and 'selection' split." 
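The two asserts above spell out the input contract of fit(): train_data must
carry a "split" column that contains at least the values "train" and
"selection". A toy frame that satisfies it — the predictor and target column
names here are purely illustrative, not from cobra's test suite:

    import numpy as np
    import pandas as pd

    train_data = pd.DataFrame({
        "age_enc": np.random.rand(100),           # hypothetical encoded predictor
        "target": np.random.randint(0, 2, 100),   # binary classification target
        "split": ["train"] * 60 + ["selection"] * 20 + ["validation"] * 20,
    })
    # set(["train", "selection"]).difference(train_data["split"].unique())
    # is empty, so both asserts pass.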
@@ -196,14 +200,18 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str, filtered_predictors, forced_predictors) - def _forward_selection(self, - train_data: pd.DataFrame, - target_column_name: str, - predictors: list, - forced_predictors: list = []) -> list: - """Perform the forward feature selection algorithm to compute a list - of models (with increasing performance). The length of the list, - i.e. the number of models, is bounded by the max_predictors class + def _forward_selection( + self, + train_data: pd.DataFrame, + target_column_name: str, + predictors: list, + forced_predictors: list = [] + ) -> list: + """Perform the forward feature selection algorithm. + + The algorithm will compute a list of models (with increasing performance). + The length of the list, i.e. the number of models, + is bounded by the max_predictors class attribute. Parameters @@ -262,12 +270,17 @@ def _forward_selection(self, return fitted_models - def _find_next_best_model(self, - train_data: pd.DataFrame, - target_column_name: str, - candidate_predictors: list, - current_predictors: list): - """Given a list of current predictors which are already selected to + def _find_next_best_model( + self, + train_data: pd.DataFrame, + target_column_name: str, + candidate_predictors: list, + current_predictors: list + ): + """ + Find the next best model with candidate predictors. + + Given a list of current predictors which are already selected to be include in the model, find amongst a list candidate predictors the predictor to add to the selected list so that the resulting model has the best performance. diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 233162c..7c55acf 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -1,3 +1,4 @@ +"""Contains all types of models supported by Cobra.""" from typing import Callable, Optional @@ -15,7 +16,10 @@ from cobra.evaluation import ClassificationEvaluator class LogisticRegressionModel: - """Wrapper around the LogisticRegression class, with additional methods + """ + Cobra's LogisticRegression model. + + Wrapper around the LogisticRegression class, with additional methods implemented such as evaluation (using AUC), getting a list of coefficients, a dictionary of coefficients per predictor, ... for convenience. @@ -28,6 +32,7 @@ class LogisticRegressionModel: """ def __init__(self): + """Initialize the LogisticRegressionModel class.""" self.logit = LogisticRegression(fit_intercept=True, C=1e9, solver='liblinear', random_state=42) self._is_fitted = False @@ -73,7 +78,6 @@ def deserialize(self, model_dict: dict): ValueError In case JSON file is no valid serialized model. """ - if not self._is_valid_dict(model_dict): raise ValueError("No valid serialized model") @@ -87,7 +91,7 @@ def deserialize(self, model_dict: dict): self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"] def get_coef(self) -> np.array: - """Returns the model coefficients. + """Return the model coefficients. Returns ------- @@ -97,7 +101,7 @@ def get_coef(self) -> np.array: return self.logit.coef_[0] def get_intercept(self) -> float: - """Returns the intercept of the model. + """Return the intercept of the model. Returns ------- @@ -107,7 +111,7 @@ def get_intercept(self) -> float: return self.logit.intercept_[0] def get_coef_by_predictor(self) -> dict: - """Returns a dictionary mapping predictor (key) to coefficient (value). + """Return a dictionary mapping predictor (key) to coefficient (value). 
Returns ------- @@ -150,7 +154,10 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str=None, metric: Optional[Callable]=None) -> float: - """Evaluate the model on a given dataset (X, y). The optional split + """ + Evaluate the model on a given dataset (X, y). + + The optional split parameter is to indicate that the dataset belongs to (train, selection, validation), so that the computation on these sets can be cached! @@ -198,8 +205,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, return self._eval_metrics_by_split[split] def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: - """Compute the importance of each predictor in the model and return - it as a DataFrame. + """Compute the importance of each predictor in the model. Parameters ---------- @@ -211,7 +217,6 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: pd.DataFrame DataFrame containing columns predictor and importance. """ - y_pred = self.score_model(data) importance_by_variable = { @@ -230,7 +235,7 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: .reset_index(drop=True)) def _is_valid_dict(self, model_dict: dict) -> bool: - + """Check if the model dictionary is valid.""" if ("meta" not in model_dict or model_dict["meta"] != "logistic-regression"): return False @@ -248,7 +253,10 @@ def _is_valid_dict(self, model_dict: dict) -> bool: class LinearRegressionModel: - """Wrapper around the LinearRegression class, with additional methods + """ + Cobra's LinearRegression model. + + Wrapper around the LinearRegression class, with additional methods implemented such as evaluation (using RMSE), getting a list of coefficients, a dictionary of coefficients per predictor, ... for convenience. @@ -261,6 +269,7 @@ class LinearRegressionModel: """ def __init__(self): + """Initialize the LinearRegression class.""" self.linear = LinearRegression(fit_intercept=True, normalize=False) self._is_fitted = False # placeholder to keep track of a list of predictors @@ -303,7 +312,6 @@ def deserialize(self, model_dict: dict): ValueError In case JSON file is no valid serialized model. """ - if not self._is_valid_dict(model_dict): raise ValueError("No valid serialized model") @@ -315,7 +323,7 @@ def deserialize(self, model_dict: dict): self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"] def get_coef(self) -> np.array: - """Returns the model coefficients. + """Return the model coefficients. Returns ------- @@ -325,7 +333,7 @@ def get_coef(self) -> np.array: return self.linear.coef_ def get_intercept(self) -> float: - """Returns the intercept of the model. + """Return the intercept of the model. Returns ------- @@ -335,7 +343,7 @@ def get_intercept(self) -> float: return self.linear.intercept_[0] def get_coef_by_predictor(self) -> dict: - """Returns a dictionary mapping predictor (key) to coefficient (value). + """Return a dictionary mapping predictor (key) to coefficient (value). Returns ------- @@ -378,7 +386,9 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str=None, metric: Optional[Callable]=None) -> float: - """Evaluate the model on a given dataset (X, y). The optional split + """Evaluate the model on a given dataset (X, y). + + The optional split parameter is to indicate that the dataset belongs to (train, selection, validation), so that the computation on these sets can be cached! 
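Both evaluate() methods lean on the same memoization idea referenced in the
docstrings above: scores are cached in _eval_metrics_by_split, keyed by the
split name. Stripped to its essentials — a simplified sketch, not the method
body as committed, with default_metric standing in for the model-specific
default:

    def evaluate(self, X, y, split=None, metric=None):
        if metric is None:
            metric = default_metric  # stand-in: AUC (logistic) or RMSE (linear)
        if split is None:
            return metric(y, self.score_model(X))  # no caching requested
        if split not in self._eval_metrics_by_split:
            self._eval_metrics_by_split[split] = metric(y, self.score_model(X))
        return self._eval_metrics_by_split[split]

Because the dict is keyed by split, compute_model_performances() can
re-evaluate the same model on train/selection/validation repeatedly at
negligible cost.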
@@ -421,8 +431,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, return self._eval_metrics_by_split[split] def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: - """Compute the importance of each predictor in the model and return - it as a DataFrame. + """Compute the importance of each predictor in the model. Parameters ---------- @@ -434,7 +443,6 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: pd.DataFrame DataFrame containing columns predictor and importance. """ - y_pred = self.score_model(data) importance_by_variable = { @@ -453,7 +461,7 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: .reset_index(drop=True)) def _is_valid_dict(self, model_dict: dict) -> bool: - + """Check if the model dictionary is valid.""" if ("meta" not in model_dict or model_dict["meta"] != "linear-regression"): return False diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 2db4abb..bb412e3 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -1,20 +1,23 @@ - +"""Calculate the univariate quality of predictors.""" import pandas as pd from sklearn.metrics import roc_auc_score, mean_squared_error from numpy import sqrt import cobra.utils as utils -def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, - target_enc_selection_data: pd.DataFrame, - predictors: list, - target_column: str, - model_type: str = "classification", - preselect_auc_threshold: float = 0.053, - preselect_rmse_threshold: float = 5, - preselect_overtrain_threshold: float = 0.05 - ) -> pd.DataFrame: - """Perform a preselection of predictors based on an AUC (in case of +def compute_univariate_preselection( + target_enc_train_data: pd.DataFrame, + target_enc_selection_data: pd.DataFrame, + predictors: list, + target_column: str, + model_type: str = "classification", + preselect_auc_threshold: float = 0.053, + preselect_rmse_threshold: float = 5, + preselect_overtrain_threshold: float = 0.05 +) -> pd.DataFrame: + """Perform a preselection of predictors. + + The preselection is based on an AUC (in case of classification) or a RMSE (in case of regression) threshold of a univariate model on a train and selection dataset and return a DataFrame containing for each variable the train and selection AUC or RMSE along with a @@ -128,7 +131,7 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, return df_out def get_preselected_predictors(df_metric: pd.DataFrame) -> list: - """Wrapper function to extract a list of predictors from df_metric. + """Extract a list of predictors from df_metric. Parameters ---------- @@ -142,7 +145,6 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: list List of preselected predictors. """ - if "AUC selection" in df_metric.columns: predictor_list = (df_metric[df_metric["preselection"]] .sort_values(by="AUC selection", ascending=False) @@ -156,8 +158,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) -> pd.DataFrame: - """Given a DataFrame and a list of predictors, compute the correlations - amongst the predictors in the DataFrame. + """Compute the correlations amongst the predictors in the DataFrame. Parameters ---------- @@ -172,7 +173,6 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, pd.DataFrame The correlation matrix of the training set. 
""" - correlations = target_enc_train_data[predictors].corr() predictors_cleaned = [utils.clean_predictor_name(predictor) diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py index e02ad4c..b72d1a4 100644 --- a/cobra/preprocessing/__init__.py +++ b/cobra/preprocessing/__init__.py @@ -1,3 +1,5 @@ +"""This module contains all preprocessing utils.""" + from .kbins_discretizer import KBinsDiscretizer from .target_encoder import TargetEncoder from .categorical_data_processor import CategoricalDataProcessor diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 175bfb5..c9e906d 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -1,3 +1,4 @@ +"""Process categorical data.""" # standard lib imports import re @@ -15,8 +16,7 @@ log = logging.getLogger(__name__) class CategoricalDataProcessor(BaseEstimator): - """Regroups the categories of categorical variables based on significance - with target variable. + """Regroup categorical variables based on significance with target variable. This class implements the Python Prediction's way of dealing with categorical data preprocessing. There are three steps involved: @@ -64,16 +64,18 @@ class CategoricalDataProcessor(BaseEstimator): "category_size_threshold", "p_value_threshold", "scale_contingency_table", "forced_categories"] - def __init__(self, - model_type: str="classification", - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}): - + def __init__( + self, + model_type: str="classification", + regroup: bool=True, + regroup_name: str="Other", + keep_missing: bool=True, + category_size_threshold: int=5, + p_value_threshold: float=0.001, + scale_contingency_table: bool=True, + forced_categories: dict={} + ): + """Initialize the CategoricalDataProcessor.""" if model_type not in ["classification", "regression"]: raise ValueError("An unexpected model_type was provided. A valid model_type is either 'classification' or 'regression'.") @@ -108,8 +110,7 @@ def attributes_to_dict(self) -> dict: return params def set_attributes_from_dict(self, params: dict): - """Set instance attributes from a dictionary of values with key the - name of the attribute. + """Set instance attributes from a dictionary of values with key the name of the attribute. Parameters ---------- @@ -156,7 +157,6 @@ def fit(self, data: pd.DataFrame, column_names: list, target_column : str Column name of the target. """ - if not self.regroup: # We do not need to fit anything if regroup is set to False! log.info("regroup was set to False, so no fitting is required") @@ -181,7 +181,10 @@ def fit(self, data: pd.DataFrame, column_names: list, def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> set: - """Compute which categories to regroup into "Other" + """ + Fit all necessary columns into "Other". + + Computes which categories to regroup into "Other" for a particular column, and return those that need to be kept as-is. @@ -271,7 +274,6 @@ def transform(self, data: pd.DataFrame, pd.DataFrame Data with additional transformed variables. """ - if self.regroup and len(self._cleaned_categories_by_column) == 0: msg = ("{} instance is not fitted yet. 
Call 'fit' with " "appropriate arguments before using this method.") @@ -291,9 +293,7 @@ def transform(self, data: pd.DataFrame, def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFrame: - """Given a DataFrame, a column name and a list of categories to - combine, create an additional column which combines these categories - into "Other". + """Create an additional column which combines categories into "Other". Parameters ---------- @@ -307,7 +307,6 @@ def _transform_column(self, data: pd.DataFrame, pd.DataFrame Original DataFrame with an added processed column. """ - column_name_clean = column_name + "_processed" data.loc[:, column_name_clean] = data[column_name].astype(object) @@ -343,7 +342,7 @@ def _transform_column(self, data: pd.DataFrame, def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: str) -> pd.DataFrame: - """Fits the data, then transforms it. + """Fit and transform the data. Parameters ---------- @@ -360,7 +359,6 @@ def fit_transform(self, data: pd.DataFrame, column_names: list, pd.DataFrame Data with additional transformed variables. """ - self.fit(data, column_names, target_column) return self.transform(data, column_names) @@ -368,7 +366,9 @@ def fit_transform(self, data: pd.DataFrame, column_names: list, def _get_small_categories(predictor_series: pd.Series, incidence: float, category_size_threshold: int) -> set: - """Fetch categories with a size below a certain threshold. + """ + Fetch categories with a size below a certain threshold. + Note that we use an additional weighting with the overall incidence. Parameters @@ -430,7 +430,10 @@ def _replace_missings(data: pd.DataFrame, def _compute_p_value(X: pd.Series, y: pd.Series, category: str, model_type: str, scale_contingency_table: bool) -> float: - """Calculates p-value in order to evaluate whether category of + """ + Calculate p-value. + + Calculate p-value in order to evaluate whether category of interest is significantly different from the rest of the categories, given the target variable. @@ -484,8 +487,11 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, @staticmethod def _replace_categories(data: pd.Series, categories: set, replace_with: str) -> pd.Series: - """Replace categories in set with "Other" and transform the remaining - categories to strings to avoid type errors later on in the pipeline. + """ + Replace categories in set with "Other". + + Transforms the remaining categories to strings + to avoid type errors later on in the pipeline. Parameters ---------- diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index c30d7de..3fe611a 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -1,4 +1,4 @@ - +"""Binning of continous data.""" # standard lib imports from copy import deepcopy from typing import List @@ -16,7 +16,10 @@ log = logging.getLogger(__name__) class KBinsDiscretizer(BaseEstimator): - """Bin continuous data into intervals of predefined size. It provides a + """ + Discretize continuous values into categorical values. + + Bin continuous data into intervals of predefined size. It provides a way to partition continuous data into discrete values, i.e. transform continuous data into nominal data. 
This can make a linear model more expressive as it introduces nonlinearity to the model, while maintaining @@ -63,13 +66,15 @@ class KBinsDiscretizer(BaseEstimator): "starting_precision", "label_format", "change_endpoint_format"] - def __init__(self, n_bins: int = 10, strategy: str = "quantile", - closed: str = "right", - auto_adapt_bins: bool = False, - starting_precision: int = 0, - label_format: str = "{} - {}", - change_endpoint_format: bool = False): - + def __init__( + self, n_bins: int = 10, strategy: str = "quantile", + closed: str = "right", + auto_adapt_bins: bool = False, + starting_precision: int = 0, + label_format: str = "{} - {}", + change_endpoint_format: bool = False + ): + """Initialize the KBinsDiscretizer.""" # validate number of bins self._validate_n_bins(n_bins) @@ -85,8 +90,7 @@ def __init__(self, n_bins: int = 10, strategy: str = "quantile", self._bins_by_column = {} def _validate_n_bins(self, n_bins: int): - """Check if ``n_bins`` is of the proper type and if it is bigger - than two + """Check if ``n_bins`` is of the proper type and if it is bigger than one. Parameters ---------- @@ -109,7 +113,7 @@ def _validate_n_bins(self, n_bins: int): .format(KBinsDiscretizer.__name__, n_bins)) def attributes_to_dict(self) -> dict: - """Return the attributes of KBinsDiscretizer in a dictionary + """Return the attributes of KBinsDiscretizer as a dictionary. Returns ------- @@ -127,8 +131,7 @@ def attributes_to_dict(self) -> dict: return params def set_attributes_from_dict(self, params: dict): - """Set instance attributes from a dictionary of values with key the - name of the attribute. + """Set instance attributes from a dictionary. Parameters ---------- @@ -163,7 +166,7 @@ def set_attributes_from_dict(self, params: dict): return self def fit(self, data: pd.DataFrame, column_names: list): - """Fits the estimator + """Fit the estimator. Parameters ---------- @@ -172,7 +175,6 @@ def fit(self, data: pd.DataFrame, column_names: list): column_names : list Names of the columns of the DataFrame to discretize """ - if self.strategy not in self.valid_strategies: raise ValueError("{}: valid options for 'strategy' are {}. " "Got strategy={!r} instead." @@ -194,7 +196,7 @@ def fit(self, data: pd.DataFrame, column_names: list): def _fit_column(self, data: pd.DataFrame, column_name: str) -> List[tuple]: - """Compute bins for a specific column in data + """Compute bins for a specific column in data. Parameters ---------- @@ -254,8 +256,10 @@ def _fit_column(self, data: pd.DataFrame, def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: - """Discretizes the data in the given list of columns by mapping each - number to the appropriate bin computed by the fit method + """Discretize the data in the given list of columns. + + This is done by mapping each number to + the appropriate bin computed by the fit method. Parameters ---------- @@ -291,9 +295,7 @@ def transform(self, data: pd.DataFrame, def _transform_column(self, data: pd.DataFrame, column_name: str, bins: List[tuple]) -> pd.DataFrame: - """Given a DataFrame, a column name and a list of bins, - create an additional column which determines the bin in which the value - of column_name lies in. + """Create a new column with binned values of column_name. 
Parameters ---------- @@ -309,7 +311,6 @@ def _transform_column(self, data: pd.DataFrame, pd.DataFrame original DataFrame with an added binned column """ - interval_idx = KBinsDiscretizer._create_index(bins, self.closed) column_name_bin = column_name + "_bin" @@ -337,7 +338,7 @@ def _transform_column(self, data: pd.DataFrame, def fit_transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: - """Fits to data, then transform it + """Fit to data, then transform it. Parameters ---------- @@ -357,8 +358,7 @@ def fit_transform(self, data: pd.DataFrame, def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, n_bins: int, col_min: float, col_max: float) -> list: - """Compute the bin edges for a given column, a DataFrame and the number - of required bins + """Compute the desired bin edges. Parameters ---------- @@ -378,7 +378,6 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, list list of bin edges from which to compute the bins """ - bin_edges = [] if self.strategy == "quantile": bin_edges = list(data[column_name] @@ -411,8 +410,10 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, return list(dict.fromkeys(bin_edges)) def _compute_minimal_precision_of_bin_edges(self, bin_edges: list) -> int: - """Compute the minimal precision of a list of bin_edges so that we end - up with a strictly ascending sequence of different numbers even when rounded. + """Compute the minimal precision of a list of bin_edges. + + This way we end up with a strictly ascending sequence of + different numbers even when rounded. The starting_precision attribute will be used as the initial precision. In case of a negative starting_precision, the bin edges will be rounded to the nearest 10, 100, ... (e.g. 5.55 -> 10, 246 -> 200, ...) @@ -427,7 +428,6 @@ def _compute_minimal_precision_of_bin_edges(self, bin_edges: list) -> int: int minimal precision for the bin edges """ - precision = self.starting_precision while True: cont = False @@ -443,8 +443,8 @@ def _compute_minimal_precision_of_bin_edges(self, bin_edges: list) -> int: return precision def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: - """Given a list of bin edges, compute the minimal precision for which - we can make meaningful bins and make those bins + """ + Return bins with the minimal precision. Parameters ---------- @@ -471,9 +471,13 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: return bins @staticmethod - def _create_index(intervals: List[tuple], - closed: str = "right") -> pd.IntervalIndex: - """Create an pd.IntervalIndex based on a list of tuples. + def _create_index( + intervals: List[tuple], + closed: str = "right" + ) -> pd.IntervalIndex: + """ + Create an pd.IntervalIndex based on a list of tuples. + This is basically a wrapper around pd.IntervalIndex.from_tuples However, the lower bound of the first entry in the list (the lower bin) is replaced by -np.inf. Similarly, the upper bound of the last entry in @@ -492,7 +496,6 @@ def _create_index(intervals: List[tuple], pd.IntervalIndex Description """ - # check if closed is of the proper form if closed not in ["left", "right"]: raise ValueError("{}: valid options for 'closed' are {}. " @@ -511,8 +514,8 @@ def _create_index(intervals: List[tuple], return pd.IntervalIndex.from_tuples(_intervals, closed) def _create_bin_labels(self, bins: List[tuple]) -> list: - """Given a list of bins, create a list of string containing the bins - as a string with a specific format (e.g. 
bin labels) + """ + Stringify the bin bounds to be used as bin labels. Parameters ---------- diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index e03d352..3ecadf0 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -1,3 +1,4 @@ +"""Preprocess data.""" # standard lib imports import inspect @@ -20,7 +21,10 @@ log = logging.getLogger(__name__) class PreProcessor(BaseEstimator): - """This class implements a so-called facade pattern to define a + """ + Preprocess data. + + This class implements a so-called facade pattern to define a higher-level interface to work with the CategoricalDataProcessor, KBinsDiscretizer and TargetEncoder classes, so that their fit and transform methods are called in the correct order. @@ -48,12 +52,14 @@ class PreProcessor(BaseEstimator): (``classification`` or ``regression``). """ - def __init__(self, - categorical_data_processor: CategoricalDataProcessor, - discretizer: KBinsDiscretizer, - target_encoder: TargetEncoder, - is_fitted: bool = False): - + def __init__( + self, + categorical_data_processor: CategoricalDataProcessor, + discretizer: KBinsDiscretizer, + target_encoder: TargetEncoder, + is_fitted: bool = False + ): + """Initialize the PreProcessor class.""" self._categorical_data_processor = categorical_data_processor self._discretizer = discretizer self._target_encoder = target_encoder @@ -63,27 +69,28 @@ def __init__(self, self.model_type = categorical_data_processor.model_type @classmethod - def from_params(cls, - model_type: str="classification", - n_bins: int=10, - strategy: str="quantile", - closed: str="right", - auto_adapt_bins: bool=False, - starting_precision: int=0, - label_format: str="{} - {}", - change_endpoint_format: bool=False, - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}, - weight: float=0.0, - imputation_strategy: str="mean"): - """Constructor to instantiate PreProcessor from all the parameters - that can be set in all its required (attribute) classes - along with good default values. + def from_params( + cls, + model_type: str="classification", + n_bins: int=10, + strategy: str="quantile", + closed: str="right", + auto_adapt_bins: bool=False, + starting_precision: int=0, + label_format: str="{} - {}", + change_endpoint_format: bool=False, + regroup: bool=True, + regroup_name: str="Other", + keep_missing: bool=True, + category_size_threshold: int=5, + p_value_threshold: float=0.001, + scale_contingency_table: bool=True, + forced_categories: dict={}, + weight: float=0.0, + imputation_strategy: str="mean" + ): + """ + Instantiate a PreProcessor from given or default params. Parameters ---------- @@ -168,8 +175,11 @@ def from_params(cls, @classmethod def from_pipeline(cls, pipeline: dict): - """Constructor to instantiate PreProcessor from a (fitted) pipeline - which was stored as a JSON file and passed to this function as a dict. + """ + Instantiate a PreProcessor from a (fitted) pipeline. + + The pipeline should be stored as a JSON file and passed to this function + as a dict. Parameters ---------- @@ -187,7 +197,6 @@ def from_pipeline(cls, pipeline: dict): If the loaded pipeline does not have all required parameters and no others. 
""" - if not PreProcessor._is_valid_pipeline(pipeline): raise ValueError("Invalid pipeline, as it does not " "contain all and only the required parameters.") @@ -222,7 +231,6 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, target_column_name : str Column name of the target. """ - # get list of all variables preprocessed_variable_names = (PreProcessor ._get_variable_list(continuous_vars, @@ -290,7 +298,6 @@ def transform(self, data: pd.DataFrame, continuous_vars: list, NotFittedError In case PreProcessor was not fitted first. """ - start = time.time() if not self._is_fitted: @@ -339,7 +346,6 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, pd.DataFrame Transformed (preprocessed) data. """ - self.fit(train_data, continuous_vars, discrete_vars, target_column_name) @@ -350,8 +356,7 @@ def train_selection_validation_split(data: pd.DataFrame, train_prop: float=0.6, selection_prop: float=0.2, validation_prop: float=0.2) -> pd.DataFrame: - """Adds `split` column with train/selection/validation values - to the dataset. + """Add `split` column with train/selection/validation values to the dataset. Train set = data on which the model is trained and on which the encoding is based. Selection set = data used for univariate and forward feature selection. Often called the validation set. @@ -401,7 +406,10 @@ def train_selection_validation_split(data: pd.DataFrame, return data def serialize_pipeline(self) -> dict: - """Serialize the preprocessing pipeline by writing all its required + """ + Serialize the preprocessing pipeline. + + This is done by writing all its required parameters to a dictionary to later store it as a JSON file. Returns @@ -429,8 +437,7 @@ def serialize_pipeline(self) -> dict: @staticmethod def _is_valid_pipeline(pipeline: dict) -> bool: - """Validate the loaded pipeline by checking if all required parameters - are present (and no others!). + """Validate the loaded pipeline by checking if only the required parameters are present. Parameters ---------- @@ -456,8 +463,9 @@ def _is_valid_pipeline(pipeline: dict) -> bool: @staticmethod def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: - """Merge lists of continuous_vars and discrete_vars and add suffix - "_bin" resp. "_processed" to the predictors. + """Merge lists of continuous_vars and discrete_vars. + + Suffixes "_bin" resp. "_processed" are added to the predictors. Parameters ---------- diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 3eda39d..0863ae6 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -1,3 +1,4 @@ +"""Target encoding.""" import logging @@ -9,7 +10,10 @@ log = logging.getLogger(__name__) class TargetEncoder(BaseEstimator): - """Target encoding for categorical features, inspired by + """ + Target encoding for categorical features. + + Inspired by http://contrib.scikit-learn.org/category_encoders/targetencoder.html. 
Replace each value of the categorical feature with the average of the @@ -62,9 +66,11 @@ class TargetEncoder(BaseEstimator): valid_imputation_strategies = ("mean", "min", "max") - def __init__(self, weight: float=0.0, - imputation_strategy: str="mean"): - + def __init__( + self, weight: float=0.0, + imputation_strategy: str="mean" + ): + """Initialize the TargetEncoder class.""" if weight < 0: raise ValueError("The value of weight cannot be smaller than zero.") elif imputation_strategy not in self.valid_imputation_strategies: @@ -107,8 +113,7 @@ def attributes_to_dict(self) -> dict: return params def set_attributes_from_dict(self, params: dict): - """Set instance attributes from a dictionary of values with key the - name of the attribute. + """Set instance attributes from a dictionary. Parameters ---------- @@ -309,8 +314,11 @@ def fit_transform(self, data: pd.DataFrame, @staticmethod def _clean_column_name(column_name: str) -> str: - """Generate a name for the new column that this target encoder - generates in the given data, by removing "_bin", "_processed" or + """ + Generate a clean name. + + Cleans the name generated by the target encoder + in the given data, by removing "_bin", "_processed" or "_cleaned" from the original categorical column, and adding "_enc". Parameters diff --git a/cobra/utils.py b/cobra/utils.py index f394caf..c681cdf 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -1,6 +1,11 @@ +"""Cobra utils.""" + def clean_predictor_name(predictor_name: str) -> str: - """Strip the redundant suffix (e.g. "_enc" or "_bin") off from the end - of the predictor name to return a clean version of the predictor + """ + Clean the predictor name. + + This is done by stripping the redundant suffix (e.g. "_enc" or "_bin") off + from the end of the predictor name to return a clean version of the predictor """ return (predictor_name.replace("_enc", "") .replace("_bin", "") From 9ab342aab6761e6708c5846aa106499d6d9cc61c Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Fri, 22 Apr 2022 09:19:03 +0200 Subject: [PATCH 3/9] fix: codestyle and some linter issues --- .pylintrc | 585 ++++++++++++++++++ cobra/__init__.py | 2 +- cobra/evaluation/evaluator.py | 120 +++- cobra/evaluation/pigs_tables.py | 41 +- cobra/evaluation/plotting_utils.py | 58 +- cobra/model_building/forward_selection.py | 46 +- cobra/model_building/models.py | 28 +- cobra/model_building/univariate_selection.py | 5 +- .../categorical_data_processor.py | 26 +- cobra/preprocessing/kbins_discretizer.py | 9 +- cobra/preprocessing/preprocessor.py | 61 +- cobra/preprocessing/target_encoder.py | 6 +- cobra/utils.py | 1 + cobra/version.py | 2 +- setup.cfg | 2 + 15 files changed, 851 insertions(+), 141 deletions(-) create mode 100644 .pylintrc create mode 100644 setup.cfg diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..ee9601a --- /dev/null +++ b/.pylintrc @@ -0,0 +1,585 @@ +[MASTER] + +# Specify a configuration file. +#rcfile= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Files or directories to be skipped. They should be base names, not +# paths. +ignore=CVS + +# Add files or directories matching the regex patterns to the ignore-list. The +# regex matches against paths and can be in Posix or Windows format. +ignore-paths= + +# Files or directories matching the regex patterns are skipped. The regex +# matches against base names, not paths. +ignore-patterns=^\.# + +# Pickle collected data for later comparisons. 
+persistent=yes + +# List of plugins (as comma separated values of python modules names) to load, +# usually to register additional checkers. +load-plugins= + pylint.extensions.check_elif, + pylint.extensions.bad_builtin, + pylint.extensions.docparams, + pylint.extensions.for_any_all, + pylint.extensions.set_membership, + pylint.extensions.code_style, + pylint.extensions.overlapping_exceptions, + pylint.extensions.typing, + pylint.extensions.redefined_variable_type, + pylint.extensions.comparison_placement, + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use. +jobs=1 + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code +extension-pkg-allow-list= + +# Minimum supported python version +py-version = 3.7.2 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# Specify a score threshold to be exceeded before program exits with error. +fail-under=10.0 + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED +# confidence= + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable= + use-symbolic-message-instead, + useless-suppression, + fixme + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once).You can also use "--disable=all" to +# disable everything first and then re-enable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use"--disable=all --enable=classes +# --disable=W" + +disable= + attribute-defined-outside-init, + duplicate-code, + invalid-name, + missing-docstring, + protected-access, + too-few-public-methods, + # handled by black + format, + + +[REPORTS] + +# Set the output format. Available formats are text, parseable, colorized, msvs +# (visual studio) and html. You can also give a reporter class, eg +# mypackage.mymodule.MyReporterClass. 
+output-format=text + +# Tells whether to display a full report or only the messages +reports=no + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables 'fatal', 'error', 'warning', 'refactor', 'convention' +# and 'info', which contain the number of messages in each category, as +# well as 'statement', which is the total number of statements analyzed. This +# score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details +#msg-template= + +# Activate the evaluation score. +score=yes + + +[LOGGING] + +# Logging modules to check that the string format arguments are in logging +# function parameter format +logging-modules=logging + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME,XXX,TODO + +# Regular expression of note tags to take in consideration. +#notes-rgx= + + +[SIMILARITIES] + +# Minimum lines number of a similarity. +min-similarity-lines=4 + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + +# Signatures are removed from the similarity computation +ignore-signatures=no + + +[VARIABLES] + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# A regular expression matching the name of dummy variables (i.e. expectedly +# not used). +dummy-variables-rgx=_$|dummy + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_,_cb + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of names allowed to shadow builtins +allowed-redefined-builtins= + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore. +ignored-argument-names=_.* + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io + + +[FORMAT] + +# Maximum number of characters on a single line. +max-line-length=100 + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Maximum number of lines in a module +max-module-lines=2000 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 
+expected-line-ending-format= + + +[BASIC] + +# Good variable names which should always be accepted, separated by a comma +good-names=i,j,k,ex,Run,_, + ax, + cv, + df, + exc, + i, + j, + l, + lr, + m, + n, + q, + qq, + s, + t, + v, + x, + X, + X_train, + X_test, + y, + + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Bad variable names which should always be refused, separated by a comma +bad-names=foo,bar,baz,toto,tutu,tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Include a hint for the correct naming format with invalid-name +include-naming-hint=no + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names +function-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names +variable-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names +const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names +attr-rgx=[a-z_][a-z0-9_]{2,}$ + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names +argument-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names +class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. +#class-const-rgx= + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names +inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names +class-rgx=[A-Z_][a-zA-Z0-9]+$ + + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names +module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ + + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names +method-rgx=[a-z_][a-z0-9_]{2,}$ + +# Regular expression which can overwrite the naming style set by typevar-naming-style. +#typevar-rgx= + +# Regular expression which should only match function or class names that do +# not require a docstring. Use ^(?!__init__$)_ to also check __init__. +no-docstring-rgx=__.*__ + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# List of decorators that define properties, such as abc.abstractproperty. 
+property-classes=abc.abstractproperty
+
+
+[TYPECHECK]
+
+# Regex pattern to define which classes are considered mixins if ignore-mixin-
+# members is set to 'yes'
+mixin-class-rgx=.*MixIn
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis). It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=SQLObject, optparse.Values, thread._local, _thread._local
+
+# List of members which are set dynamically and missed by the pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=REQUEST,acl_users,aq_parent,argparse.Namespace
+
+# List of decorators that create context managers from functions, such as
+# contextlib.contextmanager.
+contextmanager-decorators=contextlib.contextmanager
+
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+ignore-none=yes
+
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference
+# can return multiple potential results while evaluating a Python object, but
+# some branches might not be evaluated, which results in partial inference. In
+# that case, it might be useful to still emit no-member and other checks for
+# the rest of the inferred objects.
+ignore-on-opaque-inference=yes
+
+# Show a hint with possible names when a member name was not found. The aspect
+# of finding the hint is based on edit distance.
+missing-member-hint=yes
+
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+missing-member-hint-distance=1
+
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+missing-member-max-choices=1
+
+[SPELLING]
+
+# Spelling dictionary name. Available dictionaries: none. To make it work,
+# install the python-enchant package.
+spelling-dict=
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# List of comma separated words that should be considered directives if they
+# appear at the beginning of a comment and should not be checked.
+spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
+
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+
+# Limits count of emitted suggestions for spelling mistakes.
+max-spelling-suggestions=4
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method
+max-args=10
+
+# Maximum number of locals for function / method body
+max-locals=25
+
+# Maximum number of return / yield for function / method body
+max-returns=11
+
+# Maximum number of branches for function / method body
+max-branches=27
+
+# Maximum number of statements in function / method body
+max-statements=100
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# List of qualified class names to ignore when counting class parents (see R0901).
+ignored-parents=
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=11
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=25
+
+# Maximum number of boolean expressions in an if statement (see R0916).
+max-bool-expr=5
+
+# List of regular expressions of class ancestor names to
+# ignore when counting public methods (see R0903).
+exclude-too-few-public-methods=
+
+[CLASSES]
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,__new__,setUp,__post_init__
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,_fields,_replace,_source,_make
+
+# Warn about protected attribute access inside special methods
+check-protected-access-in-special-methods=no
+
+[IMPORTS]
+
+# List of modules that can be imported at any level, not just the top level
+# one.
+allow-any-import-level=
+
+# Allow wildcard imports from modules that define __all__.
+allow-wildcard-with-all=no
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=regsub,TERMIOS,Bastion,rexec
+
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled)
+import-graph=
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled)
+ext-import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled)
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+# Couples of modules and preferred modules, separated by a comma.
+preferred-modules=
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Defaults to
+# "Exception"
+overgeneral-exceptions=Exception
+
+
+[TYPING]
+
+# Set to ``no`` if the app / library does **NOT** need to support runtime
+# introspection of type annotations. If you use type annotations
+# **exclusively** for type checking of an application, you're probably fine.
+# For libraries, evaluate if some users want to access the type hints at
+# runtime first, e.g., through ``typing.get_type_hints``. Applies to Python
+# versions 3.7 - 3.9
+runtime-typing = no
+
+
+[DEPRECATED_BUILTINS]
+
+# List of builtins function names that should not be used, separated by a comma
+bad-functions=map,input
+
+
+[REFACTORING]
+
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
+
+# Complete name of functions that never return. When checking for
+# inconsistent-return-statements if a never returning function is called then
+# it will be considered as an explicit return statement and no message will be
+# printed.
+never-returning-functions=sys.exit,argparse.parse_error
+
+
+[STRING]
+
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=no
+
+# This flag controls whether the implicit-str-concat should generate a warning
+# on implicit string concatenation in sequences defined over several lines.
+check-str-concat-over-line-jumps=no
+
+
+[CODE_STYLE]
+
+# Max line length for which to still emit suggestions. Used to prevent optional
+# suggestions which would get split by a code formatter (e.g., black). Will
+# default to the setting for ``max-line-length``.
+#max-line-length-suggestions=
\ No newline at end of file
diff --git a/cobra/__init__.py b/cobra/__init__.py
index 8afad45..451287b 100644
--- a/cobra/__init__.py
+++ b/cobra/__init__.py
@@ -1,3 +1,3 @@
 """Cobra module."""

-from .version import __version__
\ No newline at end of file
+from .version import __version__
diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py
index f550431..41974e7 100644
--- a/cobra/evaluation/evaluator.py
+++ b/cobra/evaluation/evaluator.py
@@ -26,6 +26,10 @@
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics import r2_score

+
+DEFAULT_LABELS = ["0", "1"]
+
+
 class ClassificationEvaluator():
 """Evaluator class encapsulating classification model metrics and plotting functionality.

@@ -58,8 +62,8 @@ class ClassificationEvaluator():

 def __init__(
 self,
- probability_cutoff: float=None,
- lift_at: float=0.05,
+ probability_cutoff: float = None,
+ lift_at: float = 0.05,
 n_bins: int = 10
 ):
 """Initialize the ClassificationEvaluator."""
@@ -144,6 +148,12 @@ def _compute_scalar_metrics(y_true: np.ndarray,
 F1
 Matthews correlation coefficient
 Lift at given percentage
+
+ Raises
+ ----------
+ ValueError
+ If a metric cannot be computed on the given inputs, e.g. when
+ y_true contains only a single class (propagated from scikit-learn).
 """
 return pd.Series({
 "accuracy": accuracy_score(y_true, y_pred_b),
@@ -152,13 +162,16 @@ def _compute_scalar_metrics(y_true: np.ndarray,
 "recall": recall_score(y_true, y_pred_b),
 "F1": f1_score(y_true, y_pred_b, average=None)[1],
 "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b),
- "lift at {}".format(lift_at): np.round(ClassificationEvaluator
- ._compute_lift(y_true=y_true,
- y_pred=y_pred,
- lift_at=lift_at), 2)
+ f"lift at {lift_at}": np.round(
+ ClassificationEvaluator
+ ._compute_lift(
+ y_true=y_true,
+ y_pred=y_pred,
+ lift_at=lift_at
+ ), 2)
 })

- def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)):
+ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)):
 """Plot ROC curve of the model.

 Parameters
 ----------
 path : str, optional
 Path to store the figure.
 dim : tuple, optional
 Tuple with width and length of the plot.
+
+ Raises
+ ----------
+ NotFittedError
+ The instance is not fitted yet.
 """
 if self.roc_curve is None:
 msg = ("This {} instance is not fitted yet.
Call 'fit' with " @@ -178,12 +196,12 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax.plot(self.roc_curve["fpr"], self.roc_curve["tpr"], color="cornflowerblue", linewidth=3, - label="ROC curve (area = {s:.3})".format(s=auc)) + label=f"ROC curve (area = {auc:.3})") ax.plot([0, 1], [0, 1], color="darkorange", linewidth=3, linestyle="--") @@ -197,8 +215,11 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), - labels: list=["0", "1"]): + def plot_confusion_matrix( + self, path: str = None, + dim: tuple = (12, 8), + labels: list = None + ): """Plot the confusion matrix. Parameters @@ -209,14 +230,20 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), Tuple with width and length of the plot. labels : list, optional Optional list of labels, default "0" and "1". + + Raises + ---------- + NotFittedError + The instance is not fitted yet. """ + labels = labels or DEFAULT_LABELS if self.confusion_matrix is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") raise NotFittedError(msg.format(self.__class__.__name__)) - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax = sns.heatmap(self.confusion_matrix, annot=self.confusion_matrix.astype(str), fmt="s", cmap="Blues", @@ -228,7 +255,7 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), plt.show() - def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): + def plot_cumulative_response_curve(self, path: str = None, dim: tuple = (12, 8)): """Plot cumulative response curve. Parameters @@ -237,6 +264,11 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): Path to store the figure. dim : tuple, optional Tuple with width and length of the plot. + + Raises + ---------- + NotFittedError + The instance is not fitted yet. """ if self.lift_curve is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " @@ -249,7 +281,7 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): lifts = np.array(lifts)*inc_rate*100 with plt.style.context("seaborn-ticks"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") @@ -278,7 +310,7 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): + def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): """Plot lift per decile. Parameters @@ -287,6 +319,11 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): Path to store the figure. dim : tuple, optional Tuple with width and length of the plot. + + Raises + ---------- + NotFittedError + The instance is not fitted yet. """ if self.lift_curve is None: msg = ("This {} instance is not fitted yet. 
Call 'fit' with " @@ -297,7 +334,7 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): x_labels, lifts, _ = self.lift_curve with plt.style.context("seaborn-ticks"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") @@ -326,7 +363,7 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): + def plot_cumulative_gains(self, path: str = None, dim: tuple = (12, 8)): """Plot cumulative gains per decile. Parameters @@ -337,7 +374,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): Tuple with width and length of the plot. """ with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax.plot(self.cumulative_gains[0]*100, self.cumulative_gains[1]*100, color="cornflowerblue", linewidth=3, @@ -354,11 +391,11 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): # Format ticks ticks_loc_y = ax.get_yticks().tolist() ax.yaxis.set_major_locator(mticker.FixedLocator(ticks_loc_y)) - ax.set_yticklabels(["{:3.0f}%".format(x) for x in ticks_loc_y]) + ax.set_yticklabels([f"{x:3.0f}%" for x in ticks_loc_y]) ticks_loc_x = ax.get_xticks().tolist() ax.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc_x)) - ax.set_xticklabels(["{:3.0f}%".format(x) for x in ticks_loc_x]) + ax.set_xticklabels([f"{x:3.0f}%" for x in ticks_loc_x]) # Legend ax.legend(loc="lower right") @@ -384,8 +421,8 @@ def _find_optimal_cutoff(y_true: np.ndarray, float Optimal cut-off probability for the model. """ - return ClassificationEvaluator._compute_optimal_cutoff(roc_curve(y_true=y_true, - y_score=y_pred)) + fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_pred) + return ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) @staticmethod def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, @@ -455,9 +492,11 @@ def _compute_cumulative_gains(y_true: np.ndarray, return percentages, gains @staticmethod - def _compute_lift_per_bin(y_true: np.ndarray, - y_pred: np.ndarray, - n_bins: int=10) -> tuple: + def _compute_lift_per_bin( + y_true: np.ndarray, + y_pred: np.ndarray, + n_bins: int = 10 + ) -> tuple: """Compute lift of the model for a given number of bins. Parameters @@ -485,8 +524,11 @@ def _compute_lift_per_bin(y_true: np.ndarray, return x_labels, lifts, y_true.mean() @staticmethod - def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, - lift_at: float=0.05) -> float: + def _compute_lift( + y_true: np.ndarray, + y_pred: np.ndarray, + lift_at: float = 0.05 + ) -> float: """Calculate lift on a specified level. Parameters @@ -619,7 +661,7 @@ def _compute_qq_residuals(y_true: np.ndarray, pd.Series Theoretical quantiles and associated actual residuals. """ - ## also possible directly via statsmodels.api.qqplot() + # also possible directly via statsmodels.api.qqplot() n = len(y_true) @@ -636,7 +678,7 @@ def _compute_qq_residuals(y_true: np.ndarray, "residuals": df["z_res"].values, }) - def plot_predictions(self, path: str=None, dim: tuple=(12, 8)): + def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): """Plot predictions from the model against actual values. Parameters @@ -645,17 +687,24 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)): Path to store the figure. 
dim : tuple, optional Tuple with width and length of the plot. + + Raises + ---------- + NotFittedError + The instance is not fitted yet. """ if self.y_true is None and self.y_pred is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") + raise NotFittedError(msg.format(self.__class__.__name__)) + y_true = self.y_true y_pred = self.y_pred with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable x = np.arange(1, len(y_true)+1) @@ -672,7 +721,7 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_qq(self, path: str=None, dim: tuple=(12, 8)): + def plot_qq(self, path: str = None, dim: tuple = (12, 8)): """Display a Q-Q plot from the standardized prediction residuals. Parameters @@ -681,6 +730,11 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)): Path to store the figure. dim : tuple, optional Tuple with width and length of the plot. + + Raises + ---------- + NotFittedError + The instance is not fitted yet. """ if self.qq is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " @@ -690,7 +744,7 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)): with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable x = self.qq["quantiles"] y = self.qq["residuals"] @@ -710,4 +764,4 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)): if path: plt.savefig(path, format="png", dpi=300, bbox_inches="tight") - plt.show() \ No newline at end of file + plt.show() diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index 8915c5e..5503349 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -6,12 +6,15 @@ import numpy as np from matplotlib.ticker import FuncFormatter -import cobra.utils as utils +from cobra import utils -def generate_pig_tables(basetable: pd.DataFrame, - id_column_name: str, - target_column_name: str, - preprocessed_predictors: list) -> pd.DataFrame: + +def generate_pig_tables( + basetable: pd.DataFrame, + id_column_name: str, + target_column_name: str, + preprocessed_predictors: list +) -> pd.DataFrame: """Compute PIG tables for all predictors in preprocessed_predictors. The output is a DataFrame with columns ``variable``, ``label``, @@ -94,13 +97,15 @@ def compute_pig_table(basetable: pd.DataFrame, return res[column_order] -def plot_incidence(pig_tables: pd.DataFrame, - variable: str, - model_type: str, - column_order: list=None, - dim: tuple=(12, 8)): +def plot_incidence( + pig_tables: pd.DataFrame, + variable: str, + model_type: str, + column_order: list = None, + dim: tuple = (12, 8) +): """Plot a Predictor Insights Graph (PIG). - + A PIG is a graph in which the mean target value is plotted for a number of bins constructed from a predictor variable. When the target is a binary classification target, @@ -123,6 +128,12 @@ def plot_incidence(pig_tables: pd.DataFrame, on the PIG. dim: tuple, default=(12, 8) Optional tuple to configure the width and length of the plot. + + Raises + ---------- + ValueError + The `column_order` and `pig_tables` parameters do not contain + the same set of variables. 
""" if model_type not in ["classification", "regression"]: raise ValueError("An unexpected value was set for the model_type " @@ -170,7 +181,7 @@ def plot_incidence(pig_tables: pd.DataFrame, # Set labels & ticks ax.set_ylabel('incidence' if model_type == "classification" else "mean target value", fontsize=16) - ax.set_xlabel('{} bins' ''.format(variable), fontsize=16) + ax.set_xlabel(f'{variable} bins' '', fontsize=16) ax.xaxis.set_tick_params(labelsize=14) plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") @@ -181,7 +192,7 @@ def plot_incidence(pig_tables: pd.DataFrame, # so format them as percentages ax.set_yticks(np.arange(0, max(df_plot['avg_target'])+0.05, 0.05)) ax.yaxis.set_major_formatter( - FuncFormatter(lambda y, _: '{:.1%}'.format(y))) + FuncFormatter(lambda y, _: f'{y:.1%}')) elif model_type == "regression": # If the difference between the highest avg. target of all bins # versus the global avg. target AND the difference between the @@ -213,12 +224,12 @@ def plot_incidence(pig_tables: pd.DataFrame, align='center', color="#939598", zorder=1) # Set labels & ticks - ax2.set_xlabel('{} bins' ''.format(variable), fontsize=16) + ax2.set_xlabel(f'{variable} bins' '', fontsize=16) ax2.xaxis.set_tick_params(rotation=45, labelsize=14) ax2.yaxis.set_tick_params(labelsize=14) ax2.yaxis.set_major_formatter( - FuncFormatter(lambda y, _: '{:.1%}'.format(y))) + FuncFormatter(lambda y, _: f'{y:.1%}')) ax2.set_ylabel('population size', fontsize=16) ax2.tick_params(axis='y', colors="#939598") ax2.yaxis.label.set_color('#939598') diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 5aaf1a2..8f0a6b0 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -7,9 +7,19 @@ import matplotlib.pyplot as plt import seaborn as sns -def plot_univariate_predictor_quality(df_metric: pd.DataFrame, - dim: tuple=(12, 8), - path: str=None): + +DEFAULT_COLOURS = { + "train": "#0099bf", + "selection": "#ff9500", + "validation": "#8064a2" +} + + +def plot_univariate_predictor_quality( + df_metric: pd.DataFrame, + dim: tuple = (12, 8), + path: str = None +): """Plot univariate quality of the predictors. Parameters @@ -40,7 +50,7 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, # plot data with plt.style.context("seaborn-ticks"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax = sns.barplot(x=metric, y="predictor", hue="split", data=df) ax.set_title("Univariate Quality of Predictors") @@ -56,9 +66,12 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, plt.show() -def plot_correlation_matrix(df_corr: pd.DataFrame, - dim: tuple=(12, 8), - path: str=None): + +def plot_correlation_matrix( + df_corr: pd.DataFrame, + dim: tuple = (12, 8), + path: str = None +): """Plot correlation matrix amongst the predictors. Parameters @@ -70,7 +83,7 @@ def plot_correlation_matrix(df_corr: pd.DataFrame, path : str, optional Path to store the figure. 
""" - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax = sns.heatmap(df_corr, cmap='Blues') ax.set_title('Correlation Matrix') @@ -79,13 +92,14 @@ def plot_correlation_matrix(df_corr: pd.DataFrame, plt.show() -def plot_performance_curves(model_performance: pd.DataFrame, - dim: tuple=(12, 8), - path: str=None, - colors: dict={"train": "#0099bf", - "selection": "#ff9500", - "validation": "#8064a2"}, - metric_name: str=None): + +def plot_performance_curves( + model_performance: pd.DataFrame, + dim: tuple = (12, 8), + path: str = None, + colors: dict = None, + metric_name: str = None +): """Plot performance curves for the train-selection-validation sets. Parameters @@ -104,6 +118,7 @@ def plot_performance_curves(model_performance: pd.DataFrame, Defaults to RMSE in case of regression and AUC in case of classification. """ + colors = colors or DEFAULT_COLOURS model_type = model_performance["model_type"][0] if metric_name is None: @@ -155,10 +170,13 @@ def plot_performance_curves(model_performance: pd.DataFrame, plt.show() -def plot_variable_importance(df_variable_importance: pd.DataFrame, - title: str=None, - dim: tuple=(12, 8), - path: str=None): + +def plot_variable_importance( + df_variable_importance: pd.DataFrame, + title: str = None, + dim: tuple = (12, 8), + path: str = None +): """Plot variable importance of a given model. Parameters @@ -173,7 +191,7 @@ def plot_variable_importance(df_variable_importance: pd.DataFrame, Path to store the figure. """ with plt.style.context("seaborn-ticks"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax = sns.barplot(x="importance", y="predictor", data=df_variable_importance, color="cornflowerblue") diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 693fed3..ed5f119 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -10,6 +10,12 @@ log = logging.getLogger(__name__) + +DEFAULT_SPLIT_NAMES = ["train", "selection", "validation"] +DEFAULT_FORCED_PREDICTORS = [] +DEFAULT_EXCLUDED_PREDICTORS = [] + + class ForwardFeatureSelection: """Perform forward feature selection for a given dataset using a given algorithm. @@ -37,9 +43,9 @@ class ForwardFeatureSelection: def __init__( self, - model_type: str="classification", - max_predictors: int=50, - pos_only: bool=True + model_type: str = "classification", + max_predictors: int = 50, + pos_only: bool = True ): """Initialize the ForwardFeatureSelection class.""" self.model_type = model_type @@ -80,12 +86,12 @@ def get_model_from_step(self, step: int): def compute_model_performances( self, data: pd.DataFrame, target_column_name: str, - splits: list=["train", "selection", "validation"], - metric: Optional[Callable]=None, + splits: list = None, + metric: Optional[Callable] = None, ) -> pd.DataFrame: """ Compute for each model the performance for different sets. - + Different sets could be cross validation, train-selection-validation, ... Note that the computation of the performance for each split is cached inside the model itself, so it @@ -99,7 +105,7 @@ def compute_model_performances( Name of the target column. splits : list, optional List of splits to compute performance on. 
- metric: Callable (function), optional
+ metric : Callable (function), optional
 Function that computes an evaluation metric to evaluate the model's
 performances, instead of the default metric (AUC for classification,
 RMSE for regression).
@@ -113,6 +119,7 @@
 Contains for each model the performance for train, selection and
 validation sets as well as the set of predictors used in this model.
 """
+ splits = splits or DEFAULT_SPLIT_NAMES
 results = []
 predictor_set = set([])

@@ -145,9 +152,13 @@
 return df

- def fit(self, train_data: pd.DataFrame, target_column_name: str,
- predictors: list, forced_predictors: list=[],
- excluded_predictors: list=[]):
+ def fit(
+ self, train_data: pd.DataFrame,
+ target_column_name: str,
+ predictors: list,
+ forced_predictors: list = None,
+ excluded_predictors: list = None
+ ):
 """Fit the forward feature selection estimator.

 Parameters
@@ -178,6 +189,8 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
 "The train_data input df does not include a 'train' and 'selection' split."

 # remove excluded predictors from predictor lists
+ forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS
+ excluded_predictors = excluded_predictors or DEFAULT_EXCLUDED_PREDICTORS
 filtered_predictors = [var for var in predictors
 if (var not in excluded_predictors and
 var not in forced_predictors)]
@@ -205,10 +218,10 @@
 train_data: pd.DataFrame,
 target_column_name: str,
 predictors: list,
- forced_predictors: list = []
+ forced_predictors: list = None
 ) -> list:
 """Perform the forward feature selection algorithm.
- 
+
 The algorithm will compute a list of models (with increasing performance).
 The length of the list, i.e. the number of models, is bounded
 by the max_predictors class
@@ -231,6 +244,7 @@
 List of fitted models where the index of the list indicates the number of
 predictors minus one (as indices start from 0).
 """
+ forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS
 fitted_models = []
 current_predictors = []

@@ -279,7 +293,7 @@
 ):
 """
 Find the next best model with candidate predictors.
- 
+
 Given a list of current predictors which are already selected to be
 include in the model, find amongst a list candidate predictors the
 predictor to add to the selected list so that the resulting model
 -------
 self.MLModel
 Best performing model.
+
+ Raises
+ ----------
+ ValueError
+ If no metric comparison method is configured for the given
+ model_type of the ForwardFeatureSelection instance.
 """
 # placeholders
 best_model = None
diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py
index 7c55acf..cad6381 100644
--- a/cobra/model_building/models.py
+++ b/cobra/model_building/models.py
@@ -6,15 +6,15 @@
 import numpy as np
 import pandas as pd
 from scipy import stats
-from sklearn.metrics import roc_auc_score, mean_squared_error
+from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve
 from numpy import sqrt
 from sklearn.linear_model import LogisticRegression, LinearRegression
-from sklearn.metrics import roc_curve

 # custom imports
 import cobra.utils as utils
 from cobra.evaluation import ClassificationEvaluator

+
 class LogisticRegressionModel:
 """
 Cobra's LogisticRegression model.
@@ -151,9 +151,11 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray:
 # ensure we have the proper predictors and the proper order
 return self.logit.predict_proba(X[self.predictors])[:, 1]

- def evaluate(self, X: pd.DataFrame, y: pd.Series,
- split: str=None,
- metric: Optional[Callable]=None) -> float:
+ def evaluate(
+ self, X: pd.DataFrame, y: pd.Series,
+ split: str = None,
+ metric: Optional[Callable] = None
+ ) -> float:
 """
 Evaluate the model on a given dataset (X, y).

@@ -383,9 +385,11 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray:
 # ensure we have the proper predictors and the proper order
 return self.linear.predict(X[self.predictors])

- def evaluate(self, X: pd.DataFrame, y: pd.Series,
- split: str=None,
- metric: Optional[Callable]=None) -> float:
+ def evaluate(
+ self, X: pd.DataFrame, y: pd.Series,
+ split: str = None,
+ metric: Optional[Callable] = None
+ ) -> float:
 """Evaluate the model on a given dataset (X, y).

 The optional split
@@ -425,8 +429,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,

 if split is None:
 return performance
- else:
- self._eval_metrics_by_split[split] = performance
+ self._eval_metrics_by_split[split] = performance

 return self._eval_metrics_by_split[split]

@@ -460,7 +463,8 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame:
 return (df.sort_values(by="importance", ascending=False)
 .reset_index(drop=True))

- def _is_valid_dict(self, model_dict: dict) -> bool:
+ @staticmethod
+ def _is_valid_dict(model_dict: dict) -> bool:
 """Check if the model dictionary is valid."""
 if ("meta" not in model_dict
 or model_dict["meta"] != "linear-regression"):
@@ -468,7 +472,7 @@ def _is_valid_dict(self, model_dict: dict) -> bool:
 attr = ["coef_", "intercept_", "predictors"]
 for key in attr:
- if not (key in model_dict or type(model_dict[key]) != list):
+ if key not in model_dict:
 return False

 if ("params" not in model_dict
diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py
index bb412e3..48c960b 100644
--- a/cobra/model_building/univariate_selection.py
+++ b/cobra/model_building/univariate_selection.py
@@ -5,6 +5,7 @@
 import cobra.utils as utils

+
 def compute_univariate_preselection(
 target_enc_train_data: pd.DataFrame,
 target_enc_selection_data: pd.DataFrame,
@@ -16,7 +17,7 @@ def compute_univariate_preselection(
 preselect_overtrain_threshold: float = 0.05
 ) -> pd.DataFrame:
 """Perform a preselection of predictors.
- 
+
 The preselection is based on an AUC (in case of classification)
 or a RMSE (in case of regression)
 threshold of a univariate model on a train and selection dataset and
 return a DataFrame
@@ -130,6 +131,7 @@ def compute_univariate_preselection(
 return df_out

+
def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
 """Extract a list of predictors from df_metric.

@@ -156,6 +158,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
 return [col + "_enc" for col in predictor_list]

+
def compute_correlations(target_enc_train_data: pd.DataFrame,
 predictors: list) -> pd.DataFrame:
 """Compute the correlations amongst the predictors in the DataFrame.
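Note on the univariate_selection functions touched above: the idea behind the preselection is easier to see outside the diff. Below is a minimal, self-contained sketch of that idea using scikit-learn directly rather than cobra's API; the column names, thresholds and random seed are illustrative only. Each target-encoded predictor is scored by the AUC it achieves on its own on the train and selection splits, and it is kept only when the selection AUC clears a threshold and the train-minus-selection gap stays under an overtraining threshold (the regression variant described in the docstring uses RMSE instead).

    import numpy as np
    import pandas as pd
    from sklearn.metrics import roc_auc_score

    # Toy basetable with two target-encoded predictors and a binary target.
    rng = np.random.default_rng(42)
    df = pd.DataFrame({"age_enc": rng.random(1000), "noise_enc": rng.random(1000)})
    df["target"] = (df["age_enc"] + 0.3 * rng.random(1000) > 0.8).astype(int)
    df["split"] = np.where(np.arange(len(df)) % 5 < 3, "train", "selection")

    train = df[df["split"] == "train"]
    selection = df[df["split"] == "selection"]
    auc_threshold, overtrain_threshold = 0.55, 0.05  # illustrative values

    preselected = []
    for pred in ["age_enc", "noise_enc"]:
        # The target-encoded column itself serves as the univariate model's score.
        auc_train = roc_auc_score(train["target"], train[pred])
        auc_sel = roc_auc_score(selection["target"], selection[pred])
        if auc_sel >= auc_threshold and auc_train - auc_sel <= overtrain_threshold:
            preselected.append(pred)

    print(preselected)  # "age_enc" should pass; "noise_enc" should normally fail

This also explains why get_preselected_predictors returns names with the "_enc" suffix: the candidates scored here are the target-encoded columns, not the raw ones.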
diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index c9e906d..bf60079 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -15,6 +15,7 @@ log = logging.getLogger(__name__) + class CategoricalDataProcessor(BaseEstimator): """Regroup categorical variables based on significance with target variable. @@ -66,18 +67,21 @@ class CategoricalDataProcessor(BaseEstimator): def __init__( self, - model_type: str="classification", - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={} + model_type: str = "classification", + regroup: bool = True, + regroup_name: str = "Other", + keep_missing: bool = True, + category_size_threshold: int = 5, + p_value_threshold: float = 0.001, + scale_contingency_table: bool = True, + forced_categories: dict = {} ): """Initialize the CategoricalDataProcessor.""" if model_type not in ["classification", "regression"]: - raise ValueError("An unexpected model_type was provided. A valid model_type is either 'classification' or 'regression'.") + raise ValueError( + "An unexpected model_type was provided. " + "A valid model_type is either 'classification' or 'regression'." + ) self.model_type = model_type self.regroup = regroup @@ -183,7 +187,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> set: """ Fit all necessary columns into "Other". - + Computes which categories to regroup into "Other" for a particular column, and return those that need to be kept as-is. @@ -432,7 +436,7 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, scale_contingency_table: bool) -> float: """ Calculate p-value. - + Calculate p-value in order to evaluate whether category of interest is significantly different from the rest of the categories, given the target variable. diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 3fe611a..1a903d9 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -15,6 +15,7 @@ log = logging.getLogger(__name__) + class KBinsDiscretizer(BaseEstimator): """ Discretize continuous values into categorical values. @@ -399,12 +400,12 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, log.warning(f"Column {column_name} " "has NaNs present in bin definitions") - # Make absolutely sure bin edges are ordered, + # Make absolutely sure bin edges are ordered, # in very rare situations this wasn't the case - # due to rounding in quantile calculation (e.g. + # due to rounding in quantile calculation (e.g. # distributions with strong mass for same value) bin_edges = sorted(bin_edges) - + # Make sure the bin_edges are unique # and order remains the same return list(dict.fromkeys(bin_edges)) @@ -460,7 +461,7 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: # this can be a negative number, which then # rounds numbers to the nearest 10, 100, ... 
precision = self._compute_minimal_precision_of_bin_edges(bin_edges) - + bins = [] for a, b in zip(bin_edges, bin_edges[1:]): fmt_a = round(a, precision) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 3ecadf0..5aa9bda 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -20,6 +20,7 @@ log = logging.getLogger(__name__) + class PreProcessor(BaseEstimator): """ Preprocess data. @@ -71,23 +72,23 @@ def __init__( @classmethod def from_params( cls, - model_type: str="classification", - n_bins: int=10, - strategy: str="quantile", - closed: str="right", - auto_adapt_bins: bool=False, - starting_precision: int=0, - label_format: str="{} - {}", - change_endpoint_format: bool=False, - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}, - weight: float=0.0, - imputation_strategy: str="mean" + model_type: str = "classification", + n_bins: int = 10, + strategy: str = "quantile", + closed: str = "right", + auto_adapt_bins: bool = False, + starting_precision: int = 0, + label_format: str = "{} - {}", + change_endpoint_format: bool = False, + regroup: bool = True, + regroup_name: str = "Other", + keep_missing: bool = True, + category_size_threshold: int = 5, + p_value_threshold: float = 0.001, + scale_contingency_table: bool = True, + forced_categories: dict = {}, + weight: float = 0.0, + imputation_strategy: str = "mean" ): """ Instantiate a PreProcessor from given or default params. @@ -154,7 +155,7 @@ def from_params( PreProcessor Class encapsulating CategoricalDataProcessor, KBinsDiscretizer, and TargetEncoder instances. - """ + """ categorical_data_processor = CategoricalDataProcessor(model_type, regroup, regroup_name, keep_missing, @@ -162,13 +163,13 @@ def from_params( p_value_threshold, scale_contingency_table, forced_categories) - + discretizer = KBinsDiscretizer(n_bins, strategy, closed, auto_adapt_bins, starting_precision, label_format, change_endpoint_format) - + target_encoder = TargetEncoder(weight, imputation_strategy) return cls(categorical_data_processor, discretizer, target_encoder) @@ -352,10 +353,12 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, return self.transform(train_data, continuous_vars, discrete_vars) @staticmethod - def train_selection_validation_split(data: pd.DataFrame, - train_prop: float=0.6, - selection_prop: float=0.2, - validation_prop: float=0.2) -> pd.DataFrame: + def train_selection_validation_split( + data: pd.DataFrame, + train_prop: float = 0.6, + selection_prop: float = 0.2, + validation_prop: float = 0.2 + ) -> pd.DataFrame: """Add `split` column with train/selection/validation values to the dataset. Train set = data on which the model is trained and on which the encoding is based. 
@@ -394,10 +397,12 @@ def train_selection_validation_split(data: pd.DataFrame, size_valid = int(validation_prop * nrows) correction = nrows - (size_train+size_select+size_valid) - split = ['train'] * size_train \ - + ['train'] * correction \ - + ['selection'] * size_select \ - + ['validation'] * size_valid + split = ( + ['train'] * size_train + + ['train'] * correction + + ['selection'] * size_select + + ['validation'] * size_valid + ) shuffle(split) diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 0863ae6..0a9028f 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -7,8 +7,10 @@ from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError + log = logging.getLogger(__name__) + class TargetEncoder(BaseEstimator): """ Target encoding for categorical features. @@ -67,8 +69,8 @@ class TargetEncoder(BaseEstimator): valid_imputation_strategies = ("mean", "min", "max") def __init__( - self, weight: float=0.0, - imputation_strategy: str="mean" + self, weight: float = 0.0, + imputation_strategy: str = "mean" ): """Initialize the TargetEncoder class.""" if weight < 0: diff --git a/cobra/utils.py b/cobra/utils.py index c681cdf..b7727dd 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -1,5 +1,6 @@ """Cobra utils.""" + def clean_predictor_name(predictor_name: str) -> str: """ Clean the predictor name. diff --git a/cobra/version.py b/cobra/version.py index ff1068c..6849410 100644 --- a/cobra/version.py +++ b/cobra/version.py @@ -1 +1 @@ -__version__ = "1.1.0" \ No newline at end of file +__version__ = "1.1.0" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..15fbabe --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[pycodestyle] +max-line-length = 120 \ No newline at end of file From 7f82013c6f4efff04da5a2b9e78384aedf5e6dd6 Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Fri, 22 Apr 2022 13:57:55 +0200 Subject: [PATCH 4/9] chore: clean up formatting --- Makefile | 4 +- cobra/evaluation/evaluator.py | 148 +++++++----- cobra/evaluation/pigs_tables.py | 77 ++++--- cobra/evaluation/plotting_utils.py | 76 ++++--- cobra/model_building/forward_selection.py | 141 ++++++++---- cobra/model_building/models.py | 18 +- cobra/model_building/univariate_selection.py | 56 +++-- .../categorical_data_processor.py | 168 +++++++++----- cobra/preprocessing/kbins_discretizer.py | 205 +++++++++++------ cobra/preprocessing/preprocessor.py | 215 ++++++++++++------ cobra/preprocessing/target_encoder.py | 81 ++++--- 11 files changed, 772 insertions(+), 417 deletions(-) diff --git a/Makefile b/Makefile index b31a1db..29466d4 100644 --- a/Makefile +++ b/Makefile @@ -23,11 +23,11 @@ lint: @echo 'lint OK' lint-minimal: - pylint E cobra + pylint -E cobra @echo 'lint minimal OK' typecheck: - mypy cobra + mypy cobra --allow-redefinition --allow-untyped-globals --ignore-missing-imports @echo 'typecheck OK' codestyle: diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 41974e7..3255fa2 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -120,10 +120,12 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.cumulative_gains = ClassificationEvaluator._compute_cumulative_gains(y_true, y_pred) @staticmethod - def _compute_scalar_metrics(y_true: np.ndarray, - y_pred: np.ndarray, - y_pred_b: np.ndarray, - lift_at: float) -> pd.Series: + def _compute_scalar_metrics( + y_true: np.ndarray, + y_pred: np.ndarray, + y_pred_b: np.ndarray, + lift_at: 
float + ) -> pd.Series: """Compute various scalar performance measures. Parameters @@ -187,17 +189,16 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): The instance is not fitted yet. """ if self.roc_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) auc = float(self.scalar_metrics.loc["AUC"]) with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax.plot(self.roc_curve["fpr"], self.roc_curve["tpr"], color="cornflowerblue", linewidth=3, @@ -238,16 +239,19 @@ def plot_confusion_matrix( """ labels = labels or DEFAULT_LABELS if self.confusion_matrix is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax = sns.heatmap(self.confusion_matrix, - annot=self.confusion_matrix.astype(str), - fmt="s", cmap="Blues", - xticklabels=labels, yticklabels=labels) + ax = sns.heatmap( + self.confusion_matrix, + annot=self.confusion_matrix.astype(str), + fmt="s", cmap="Blues", + xticklabels=labels, yticklabels=labels + ) ax.set_title("Confusion matrix", fontsize=20) if path: @@ -271,27 +275,37 @@ def plot_cumulative_response_curve(self, path: str = None, dim: tuple = (12, 8)) The instance is not fitted yet. """ if self.lift_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) x_labels, lifts, inc_rate = self.lift_curve - lifts = np.array(lifts)*inc_rate*100 with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - plt.bar(x_labels[::-1], lifts, align="center", - color="cornflowerblue") + plt.bar( + x_labels[::-1], + lifts, + align="center", + color="cornflowerblue") plt.ylabel("response (%)", fontsize=16) plt.xlabel("decile", fontsize=16) ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) - plt.axhline(y=inc_rate*100, color="darkorange", linestyle="--", - xmin=0.05, xmax=0.95, linewidth=3, label="Incidence") + plt.axhline( + y=inc_rate*100, + color="darkorange", + linestyle="--", + xmin=0.05, + xmax=0.95, + linewidth=3, + label="Incidence" + ) # Legend ax.legend(loc="upper right") @@ -326,9 +340,10 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): The instance is not fitted yet. """ if self.lift_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
+ ) raise NotFittedError(msg.format(self.__class__.__name__)) x_labels, lifts, _ = self.lift_curve @@ -343,8 +358,15 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) - plt.axhline(y=1, color="darkorange", linestyle="--", - xmin=0.05, xmax=0.95, linewidth=3, label="Baseline") + plt.axhline( + y=1, + color="darkorange", + linestyle="--", + xmin=0.05, + xmax=0.95, + linewidth=3, + label="Baseline" + ) # Legend ax.legend(loc="upper right") @@ -405,8 +427,10 @@ def plot_cumulative_gains(self, path: str = None, dim: tuple = (12, 8)): plt.show() @staticmethod - def _find_optimal_cutoff(y_true: np.ndarray, - y_pred: np.ndarray) -> float: + def _find_optimal_cutoff( + y_true: np.ndarray, + y_pred: np.ndarray + ) -> float: """Find the optimal probability cut off point for a classification model. Parameters @@ -425,8 +449,11 @@ def _find_optimal_cutoff(y_true: np.ndarray, return ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) @staticmethod - def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, - thresholds: np.ndarray) -> float: + def _compute_optimal_cutoff( + fpr: np.ndarray, + tpr: np.ndarray, + thresholds: np.ndarray + ) -> float: """Calculate the optimal probability cut-off point for a classification model. The optimal cut-off would be where TPR is high and FPR is low, hence @@ -454,8 +481,10 @@ def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, return thresholds[optimal_index][0] @staticmethod - def _compute_cumulative_gains(y_true: np.ndarray, - y_pred: np.ndarray) -> tuple: + def _compute_cumulative_gains( + y_true: np.ndarray, + y_pred: np.ndarray + ) -> tuple: """Compute cumulative gains of the model. Code from (https://github.com/reiinakano/scikit-plot/blob/ @@ -514,10 +543,15 @@ def _compute_lift_per_bin( tuple Includes x-labels, lifts per decile, and target incidence. """ - lifts = [ClassificationEvaluator._compute_lift(y_true=y_true, - y_pred=y_pred, - lift_at=perc_lift) - for perc_lift in np.linspace(1/n_bins, 1, num=n_bins, endpoint=True)] + lifts = [ + ClassificationEvaluator + ._compute_lift( + y_true=y_true, + y_pred=y_pred, + lift_at=perc_lift + ) + for perc_lift in np.linspace(1/n_bins, 1, num=n_bins, endpoint=True) + ] x_labels = [len(lifts)-x for x in np.arange(0, len(lifts), 1)] @@ -562,14 +596,14 @@ def _compute_lift( avg_incidence = np.einsum("ij->j", y_true_)/float(len(y_true_)) # Sort and filter data - data_sorted = (y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] - .reshape(stop, 1)) + data_sorted = ( + y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] + .reshape(stop, 1) + ) # Calculate lift (einsum is a very fast way of summing, but needs specific shape) inc_in_top_n = np.einsum("ij->j", data_sorted)/float(len(data_sorted)) - lift = np.round(inc_in_top_n/avg_incidence, 2)[0] - return lift @@ -617,8 +651,10 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.qq = RegressionEvaluator._compute_qq_residuals(y_true, y_pred) @staticmethod - def _compute_scalar_metrics(y_true: np.ndarray, - y_pred: np.ndarray) -> pd.Series: + def _compute_scalar_metrics( + y_true: np.ndarray, + y_pred: np.ndarray + ) -> pd.Series: """Compute various scalar performance measures. 
Parameters
@@ -645,8 +681,10 @@ def _compute_scalar_metrics(y_true: np.ndarray,
 })

 @staticmethod
- def _compute_qq_residuals(y_true: np.ndarray,
- y_pred: np.ndarray) -> pd.Series:
- """Compute various scalar performance measures.
+ def _compute_qq_residuals(
+ y_true: np.ndarray,
+ y_pred: np.ndarray
+ ) -> pd.Series:
+ """Compute the theoretical quantiles and standardized residuals for a Q-Q plot.

 Parameters
@@ -694,16 +732,16 @@ def plot_predictions(self, path: str = None, dim: tuple = (12, 8)):
 The instance is not fitted yet.
 """
 if self.y_true is None and self.y_pred is None:
- msg = ("This {} instance is not fitted yet. Call 'fit' with "
- "appropriate arguments before using this method.")
-
+ msg = (
+ "This {} instance is not fitted yet. Call 'fit' with "
+ "appropriate arguments before using this method."
+ )
 raise NotFittedError(msg.format(self.__class__.__name__))

 y_true = self.y_true
 y_pred = self.y_pred

 with plt.style.context("seaborn-whitegrid"):
-
 fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable

 x = np.arange(1, len(y_true)+1)
@@ -737,13 +775,13 @@ def plot_qq(self, path: str = None, dim: tuple = (12, 8)):
 The instance is not fitted yet.
 """
 if self.qq is None:
- msg = ("This {} instance is not fitted yet. Call 'fit' with "
- "appropriate arguments before using this method.")
-
+ msg = (
+ "This {} instance is not fitted yet. Call 'fit' with "
+ "appropriate arguments before using this method."
+ )
 raise NotFittedError(msg.format(self.__class__.__name__))

 with plt.style.context("seaborn-whitegrid"):
-
 fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable

 x = self.qq["quantiles"]
diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py
index 5503349..e728dd0 100644
--- a/cobra/evaluation/pigs_tables.py
+++ b/cobra/evaluation/pigs_tables.py
@@ -38,10 +38,12 @@ def generate_pig_tables(
 DataFrame containing a PIG table for all predictors.
 """
 pigs = [
- compute_pig_table(basetable,
- column_name,
- target_column_name,
- id_column_name)
+ compute_pig_table(
+ basetable,
+ column_name,
+ target_column_name,
+ id_column_name
+ )
 for column_name in sorted(preprocessed_predictors)
 if column_name not in [id_column_name, target_column_name]
 ]
@@ -49,10 +51,12 @@ def generate_pig_tables(
 return output

-def compute_pig_table(basetable: pd.DataFrame,
- predictor_column_name: str,
- target_column_name: str,
- id_column_name: str) -> pd.DataFrame:
+def compute_pig_table(
+ basetable: pd.DataFrame,
+ predictor_column_name: str,
+ target_column_name: str,
+ id_column_name: str
+) -> pd.DataFrame:
 """Compute the PIG table of a given predictor for a given target.

 Parameters
@@ -76,12 +80,17 @@ def compute_pig_table(
 # group by the binned variable, compute the incidence
 # (=mean of the target for the given bin) and compute the bin size
 # (e.g. COUNT(id_column_name)). After that, rename the columns
- res = (basetable.groupby(predictor_column_name)
- .agg({target_column_name: "mean", id_column_name: "size"})
- .reset_index()
- .rename(columns={predictor_column_name: "label",
- target_column_name: "avg_target",
- id_column_name: "pop_size"}))
+ res = (
+ basetable
+ .groupby(predictor_column_name)
+ .agg({target_column_name: "mean", id_column_name: "size"})
+ .reset_index()
+ .rename(columns={
+ predictor_column_name: "label",
+ target_column_name: "avg_target",
+ id_column_name: "pop_size"
+ })
+ )

 # add the column name to a variable column
 # add the average incidence
@@ -136,9 +145,11 @@ def plot_incidence(
 the same set of variables.
""" if model_type not in ["classification", "regression"]: - raise ValueError("An unexpected value was set for the model_type " - "parameter. Expected 'classification' or " - "'regression'.") + raise ValueError( + "An unexpected value was set for the model_type " + "parameter. Expected 'classification' or " + "'regression'." + ) df_plot = pig_tables[pig_tables['variable'] == variable].copy() @@ -149,8 +160,10 @@ def plot_incidence( 'the same set of variables.') df_plot['label'] = df_plot['label'].astype('category') - df_plot['label'].cat.reorder_categories(column_order, - inplace=True) + df_plot['label'].cat.reorder_categories( + column_order, + inplace=True + ) df_plot.sort_values(by=['label'], ascending=True, inplace=True) df_plot.reset_index(inplace=True) @@ -179,12 +192,18 @@ def plot_incidence( ax.plot(np.nan, "#939598", linewidth=6, label='bin size') # Set labels & ticks - ax.set_ylabel('incidence' if model_type == "classification" else "mean target value", - fontsize=16) + ax.set_ylabel( + 'incidence' if model_type == "classification" else "mean target value", + fontsize=16 + ) ax.set_xlabel(f'{variable} bins' '', fontsize=16) ax.xaxis.set_tick_params(labelsize=14) - plt.setp(ax.get_xticklabels(), - rotation=45, ha="right", rotation_mode="anchor") + plt.setp( + ax.get_xticklabels(), + rotation=45, + ha="right", + rotation_mode="anchor" + ) ax.yaxis.set_tick_params(labelsize=14) if model_type == "classification": @@ -247,9 +266,15 @@ def plot_incidence( else: title = "Mean target plot - " + variable fig.suptitle(title, fontsize=22) - ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102), - loc=3, ncol=1, mode="expand", borderaxespad=0., - prop={"size": 14}) + ax.legend( + frameon=False, + bbox_to_anchor=(0., 1.01, 1., .102), + loc=3, + ncol=1, + mode="expand", + borderaxespad=0., + prop={"size": 14} + ) # Set order of layers ax.set_zorder(1) diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 8f0a6b0..ae91220 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -40,13 +40,18 @@ def plot_univariate_predictor_quality( metric = "RMSE" ascending = True - df = (df_metric[df_metric["preselection"]] - .sort_values(by=metric+" selection", ascending=ascending)) - - df = pd.melt(df, id_vars=["predictor"], - value_vars=[metric+" train", metric+" selection"], - var_name="split", - value_name=metric) + df = ( + df_metric[df_metric["preselection"]] + .sort_values(by=metric+" selection", ascending=ascending) + ) + + df = pd.melt( + df, + id_vars=["predictor"], + value_vars=[metric+" train", metric+" selection"], + var_name="split", + value_name=metric + ) # plot data with plt.style.context("seaborn-ticks"): @@ -127,28 +132,39 @@ def plot_performance_curves( elif model_type == "regression": metric_name = "RMSE" - max_metric = np.round(max(max(model_performance['train_performance']), - max(model_performance['selection_performance']), - max(model_performance['validation_performance'])), 1) + max_metric = np.round( + max( + max(model_performance['train_performance']), + max(model_performance['selection_performance']), + max(model_performance['validation_performance']) + ), 1) with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) - plt.plot(model_performance['train_performance'], marker=".", - markersize=20, linewidth=3, label="train", - color=colors["train"]) - plt.plot(model_performance['selection_performance'], marker=".", - markersize=20, linewidth=3, label="selection", - 
color=colors["selection"])
- plt.plot(model_performance['validation_performance'], marker=".",
- markersize=20, linewidth=3, label="validation",
- color=colors["validation"])
+ plt.plot(
+ model_performance['train_performance'], marker=".",
+ markersize=20, linewidth=3, label="train",
+ color=colors["train"]
+ )
+ plt.plot(
+ model_performance['selection_performance'], marker=".",
+ markersize=20, linewidth=3, label="selection",
+ color=colors["selection"]
+ )
+ plt.plot(
+ model_performance['validation_performance'], marker=".",
+ markersize=20, linewidth=3, label="validation",
+ color=colors["validation"]
+ )

 # Set x- and y-ticks
 ax.set_xticks(np.arange(len(model_performance['last_added_predictor'])))
- ax.set_xticklabels(model_performance['last_added_predictor'].tolist(),
- rotation=40, ha='right')
+ ax.set_xticklabels(
+ model_performance['last_added_predictor'].tolist(),
+ rotation=40,
+ ha='right'
+ )

 if model_type == "classification":
 ax.set_yticks(np.arange(0.5, max_metric + 0.02, 0.05))
@@ -160,9 +176,11 @@ def plot_performance_curves(

 # Make pretty
 ax.legend(loc='lower right')
- fig.suptitle('Performance curves forward feature selection',
- fontsize=20)
- plt.title("Metric: "+metric_name, fontsize=15, loc="left")
+ fig.suptitle(
+ 'Performance curves forward feature selection',
+ fontsize=20
+ )
+ plt.title("Metric: " + metric_name, fontsize=15, loc="left")
 plt.ylabel('Model performance')

 if path is not None:
@@ -192,9 +210,11 @@ def plot_variable_importance(
 """
 with plt.style.context("seaborn-ticks"):
 fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable
- ax = sns.barplot(x="importance", y="predictor",
- data=df_variable_importance,
- color="cornflowerblue")
+ ax = sns.barplot(
+ x="importance", y="predictor",
+ data=df_variable_importance,
+ color="cornflowerblue"
+ )
 if title:
 ax.set_title(title)
 else:
diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py
index ed5f119..9b897d9 100644
--- a/cobra/model_building/forward_selection.py
+++ b/cobra/model_building/forward_selection.py
@@ -1,7 +1,7 @@
 """Feature forward selection."""

 import logging
-from typing import Callable, Optional
+from typing import Callable, List, Optional, Union

 import pandas as pd
 from tqdm.auto import tqdm
@@ -15,6 +15,8 @@
 DEFAULT_FORCED_PREDICTORS = []
 DEFAULT_EXCLUDED_PREDICTORS = []

+Model = Union[LinearRegressionModel, LogisticRegressionModel, None]
+

 class ForwardFeatureSelection:
 """Perform forward feature selection for a given dataset using a given algorithm.
@@ -59,7 +61,7 @@

 self._fitted_models = []

- def get_model_from_step(self, step: int):
+ def get_model_from_step(self, step: int) -> Model:
 """Get fitted model from a particular step.

 Parameters
 In case step is larger than the number of available models.
 """
 if len(self._fitted_models) <= step:
- raise ValueError(f"No model available for step {step}. "
- "The first step starts from index 0.")
+ raise ValueError(
+ f"No model available for step {step}. "
+ "The first step starts from index 0."
+            )
 
         return self._fitted_models[step]
 
@@ -124,8 +128,10 @@ def compute_model_performances(
 
         predictor_set = set([])
         for model in self._fitted_models:
-            last_added_predictor = (set(model.predictors)
-                                    .difference(predictor_set))
+            last_added_predictor = (
+                set(model.predictors)
+                .difference(predictor_set)
+            )
             tmp = {
                 "predictors": model.predictors,
                 "last_added_predictor": list(last_added_predictor)[0]
@@ -144,7 +150,6 @@ def compute_model_performances(
                 })
 
             results.append(tmp)
-
             predictor_set = predictor_set.union(set(model.predictors))
 
         df = pd.DataFrame(results)
@@ -191,27 +196,41 @@ def fit(
         # remove excluded predictors from predictor lists
         forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS
         excluded_predictors = excluded_predictors or DEFAULT_EXCLUDED_PREDICTORS
-        filtered_predictors = [var for var in predictors
-                               if (var not in excluded_predictors and
-                                   var not in forced_predictors)]
+        filtered_predictors = [
+            var for var in predictors
+            if (
+                var not in excluded_predictors
+                and var not in forced_predictors
+            )
+        ]
 
         # checks on predictor lists and self.max_predictors attr
         if len(forced_predictors) > self.max_predictors:
-            raise ValueError("Size of forced_predictors cannot be bigger than "
-                             "max_predictors.")
+            raise ValueError(
+                "Size of forced_predictors cannot be bigger than "
+                "max_predictors."
+            )
         elif len(forced_predictors) == self.max_predictors:
-            log.info("Size of forced_predictors equals max_predictors "
-                     "only one model will be trained...")
+            log.info(
+                "Size of forced_predictors equals max_predictors, "
+                "so only one model will be trained..."
+            )
             # train model with all forced_predictors (only)
-            (self._fitted_models
-             .append(self._train_model(train_data[train_data["split"] == "train"],
-                                       target_column_name,
-                                       forced_predictors)))
+            self._fitted_models.append(
+                self._train_model(
+                    train_data[train_data["split"] == "train"],
+                    target_column_name,
+                    forced_predictors
+                )
+            )
+
         else:
-            self._fitted_models = self._forward_selection(train_data,
-                                                          target_column_name,
-                                                          filtered_predictors,
-                                                          forced_predictors)
+            self._fitted_models = self._forward_selection(
+                train_data,
+                target_column_name,
+                filtered_predictors,
+                forced_predictors
+            )
 
     def _forward_selection(
         self,
@@ -219,7 +238,7 @@ def _forward_selection(
         target_column_name: str,
         predictors: list,
         forced_predictors: list = None
-    ) -> list:
+    ) -> List[Model]:
         """Perform the forward feature selection algorithm.
 
         The algorithm will compute a list of models (with increasing performance).
@@ -251,26 +270,35 @@ def _forward_selection(
 
         max_steps = 1 + min(self.max_predictors,
                             len(predictors) + len(forced_predictors))
 
-        for step in tqdm(range(1, max_steps), desc="Sequentially adding best "
-                                                   "predictor..."):
+        for step in tqdm(
+            range(1, max_steps),
+            desc="Sequentially adding best predictor..."
+ ): if step <= len(forced_predictors): # first, we go through the forced predictors - candidate_predictors = [var for var in forced_predictors - if var not in current_predictors] + candidate_predictors = [ + var for var in forced_predictors + if var not in current_predictors + ] else: - candidate_predictors = [var for var in (predictors - + forced_predictors) - if var not in current_predictors] - - model = self._find_next_best_model(train_data, - target_column_name, - candidate_predictors, - current_predictors) + candidate_predictors = [ + var for var in (predictors + forced_predictors) + if var not in current_predictors + ] + + model = self._find_next_best_model( + train_data, + target_column_name, + candidate_predictors, + current_predictors + ) if model is not None: # Add new model predictors to the list of current predictors - current_predictors = list(set(current_predictors) - .union(set(model.predictors))) + current_predictors = list( + set(current_predictors) + .union(set(model.predictors)) + ) fitted_models.append(model) # else: @@ -290,7 +318,7 @@ def _find_next_best_model( target_column_name: str, candidate_predictors: list, current_predictors: list - ): + ) -> Model: """ Find the next best model with candidate predictors. @@ -324,27 +352,36 @@ def _find_next_best_model( # placeholders best_model = None if self.MLModel == LogisticRegressionModel: - best_performance = -1 # AUC metric is used + best_performance = -1.0 # AUC metric is used elif self.MLModel == LinearRegressionModel: best_performance = float("inf") # RMSE metric is used else: - raise ValueError("No metric comparison method has been configured " - "for the given model_type specified as " - "ForwardFeatureSelection argument.") + raise ValueError( + "No metric comparison method has been configured " + "for the given model_type specified as " + "ForwardFeatureSelection argument." + ) fit_data = train_data[train_data["split"] == "train"] # data to fit the models with sel_data = train_data[train_data["split"] == "selection"] # data to compare the models with for pred in candidate_predictors: # Train a model with an additional predictor - model = self._train_model(fit_data, target_column_name, - (current_predictors + [pred])) + model = self._train_model( + fit_data, + target_column_name, + (current_predictors + [pred]) + ) # Evaluate the model - performance = (model - .evaluate(sel_data[current_predictors + [pred]], - sel_data[target_column_name], - split="selection")) + performance = ( + model + .evaluate( + sel_data[current_predictors + [pred]], + sel_data[target_column_name], + split="selection" + ) + ) if self.pos_only and (not (model.get_coef() >= 0).all()): continue @@ -362,8 +399,12 @@ def _find_next_best_model( return best_model - def _train_model(self, train_data: pd.DataFrame, target_column_name: str, - predictors: list): + def _train_model( + self, + train_data: pd.DataFrame, + target_column_name: str, + predictors: list + ) -> Model: """Train the model with a given set of predictors. 
Parameters diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index cad6381..58571b3 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -233,8 +233,10 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: orient="index").reset_index() df.columns = ["predictor", "importance"] - return (df.sort_values(by="importance", ascending=False) - .reset_index(drop=True)) + return ( + df.sort_values(by="importance", ascending=False) + .reset_index(drop=True) + ) def _is_valid_dict(self, model_dict: dict) -> bool: """Check if the model dictionary is valid.""" @@ -456,12 +458,16 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: for predictor in self.predictors } - df = pd.DataFrame.from_dict(importance_by_variable, - orient="index").reset_index() + df = pd.DataFrame.from_dict( + importance_by_variable, + orient="index" + ).reset_index() df.columns = ["predictor", "importance"] - return (df.sort_values(by="importance", ascending=False) - .reset_index(drop=True)) + return ( + df.sort_values(by="importance", ascending=False) + .reset_index(drop=True) + ) @staticmethod def _is_valid_dict(model_dict: dict) -> bool: diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 48c960b..e4d1ff6 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -81,9 +81,13 @@ def compute_univariate_preselection( y_true=target_enc_selection_data[target_column], y_score=target_enc_selection_data[predictor]) - result.append({"predictor": cleaned_predictor, - "AUC train": auc_train, - "AUC selection": auc_selection}) + result.append( + { + "predictor": cleaned_predictor, + "AUC train": auc_train, + "AUC selection": auc_selection + } + ) df_auc = pd.DataFrame(result) @@ -92,8 +96,8 @@ def compute_univariate_preselection( # Identify those variables for which the AUC difference between train # and selection is within a user-defined ratio - auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) - < preselect_overtrain_threshold) + preselect_overtrain = df_auc["AUC train"] - df_auc["AUC selection"] + auc_overtrain = preselect_overtrain < preselect_overtrain_threshold df_auc["preselection"] = auc_thresh & auc_overtrain @@ -111,9 +115,13 @@ def compute_univariate_preselection( y_true=target_enc_selection_data[target_column], y_pred=target_enc_selection_data[predictor])) - result.append({"predictor": cleaned_predictor, - "RMSE train": rmse_train, - "RMSE selection": rmse_selection}) + result.append( + { + "predictor": cleaned_predictor, + "RMSE train": rmse_train, + "RMSE selection": rmse_selection + } + ) df_rmse = pd.DataFrame(result) @@ -122,8 +130,8 @@ def compute_univariate_preselection( # Identify those variables for which the RMSE difference between train # and selection is within a user-defined ratio - rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"]) # flip subtraction vs. AUC - < preselect_overtrain_threshold) + preselect_overtrain = df_rmse["RMSE selection"] - df_rmse["RMSE train"] # flip subtraction vs. AUC + rmse_overtrain = preselect_overtrain < preselect_overtrain_threshold df_rmse["preselection"] = rmse_thresh & rmse_overtrain @@ -148,19 +156,25 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: List of preselected predictors. 
""" if "AUC selection" in df_metric.columns: - predictor_list = (df_metric[df_metric["preselection"]] - .sort_values(by="AUC selection", ascending=False) - .predictor.tolist()) + predictor_list = ( + df_metric[df_metric["preselection"]] + .sort_values(by="AUC selection", ascending=False) + .predictor.tolist() + ) elif "RMSE selection" in df_metric.columns: - predictor_list = (df_metric[df_metric["preselection"]] - .sort_values(by="RMSE selection", ascending=True) # lower is better - .predictor.tolist()) + predictor_list = ( + df_metric[df_metric["preselection"]] + .sort_values(by="RMSE selection", ascending=True) # lower is better + .predictor.tolist() + ) return [col + "_enc" for col in predictor_list] -def compute_correlations(target_enc_train_data: pd.DataFrame, - predictors: list) -> pd.DataFrame: +def compute_correlations( + target_enc_train_data: pd.DataFrame, + predictors: list +) -> pd.DataFrame: """Compute the correlations amongst the predictors in the DataFrame. Parameters @@ -178,8 +192,10 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, """ correlations = target_enc_train_data[predictors].corr() - predictors_cleaned = [utils.clean_predictor_name(predictor) - for predictor in predictors] + predictors_cleaned = [ + utils.clean_predictor_name(predictor) + for predictor in predictors + ] # Change index and columns with the cleaned version of the predictors # e.g. change "var1_enc" with "var1" diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index bf60079..9d2f263 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -61,9 +61,11 @@ class CategoricalDataProcessor(BaseEstimator): Whether contingency table should be scaled before chi^2. """ - valid_keys = ["model_type", "regroup", "regroup_name", "keep_missing", - "category_size_threshold", "p_value_threshold", - "scale_contingency_table", "forced_categories"] + valid_keys = [ + "model_type", "regroup", "regroup_name", "keep_missing", + "category_size_threshold", "p_value_threshold", + "scale_contingency_table", "forced_categories" + ] def __init__( self, @@ -81,7 +83,7 @@ def __init__( raise ValueError( "An unexpected model_type was provided. " "A valid model_type is either 'classification' or 'regression'." - ) + ) self.model_type = model_type self.regroup = regroup @@ -130,9 +132,11 @@ def set_attributes_from_dict(self, params: dict): _fitted_output = params.pop("_cleaned_categories_by_column", {}) if type(_fitted_output) != dict: - raise ValueError("_cleaned_categories_by_column is expected to " - "be a dict but is of type {} instead" - .format(type(_fitted_output))) + raise ValueError( + "_cleaned_categories_by_column is expected to " + "be a dict but is of type {} instead" + .format(type(_fitted_output)) + ) # Clean out params dictionary to remove unknown keys (for safety!) params = {key: params[key] for key in params if key in self.valid_keys} @@ -147,8 +151,12 @@ def set_attributes_from_dict(self, params: dict): return self - def fit(self, data: pd.DataFrame, column_names: list, - target_column: str): + def fit( + self, + data: pd.DataFrame, + column_names: list, + target_column: str + ): """Fit the CategoricalDataProcessor. 
Parameters @@ -166,12 +174,15 @@ def fit(self, data: pd.DataFrame, column_names: list, log.info("regroup was set to False, so no fitting is required") return None - for column_name in tqdm(column_names, desc="Fitting category " - "regrouping..."): - + for column_name in tqdm( + column_names, + desc="Fitting category regrouping..." + ): if column_name not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column_name)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting" .format(column_name) + ) continue cleaned_cats = self._fit_column(data, column_name, target_column) @@ -220,9 +231,11 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, combined_categories = set() # replace missings and get unique categories as a list - X = (CategoricalDataProcessor - ._replace_missings(data[column_name]) - .astype(object)) + X = ( + CategoricalDataProcessor + ._replace_missings(data[column_name]) + .astype(object) + ) unique_categories = list(X.unique()) @@ -235,21 +248,28 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, # get small categories and add them to the merged category list # does not apply incidence factor when model_type = "regression" - small_categories = (CategoricalDataProcessor - ._get_small_categories( - X, - incidence, - self.category_size_threshold)) + small_categories = ( + CategoricalDataProcessor + ._get_small_categories( + X, + incidence, + self.category_size_threshold + ) + ) combined_categories = combined_categories.union(small_categories) for category in unique_categories: if category in small_categories: continue - pval = (CategoricalDataProcessor - ._compute_p_value(X, y, category, - model_type, - self.scale_contingency_table)) + pval = ( + CategoricalDataProcessor + ._compute_p_value( + X, y, category, + model_type, + self.scale_contingency_table + ) + ) # if not significant, add it to the list if pval > self.p_value_threshold: @@ -261,8 +281,11 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, return set(unique_categories).difference(combined_categories) - def transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: + def transform( + self, + data: pd.DataFrame, + column_names: list + ) -> pd.DataFrame: """Transform the data. Parameters @@ -279,24 +302,26 @@ def transform(self, data: pd.DataFrame, Data with additional transformed variables. """ if self.regroup and len(self._cleaned_categories_by_column) == 0: - msg = ("{} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "{} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) for column_name in column_names: if column_name not in data.columns: - log.warning("Unknown column '{}' will be skipped" - .format(column_name)) + log.warning("Unknown column '{}' will be skipped".format(column_name)) continue data = self._transform_column(data, column_name) return data - def _transform_column(self, data: pd.DataFrame, - column_name: str) -> pd.DataFrame: + def _transform_column( + self, data: pd.DataFrame, + column_name: str + ) -> pd.DataFrame: """Create an additional column which combines categories into "Other". 
Parameters @@ -315,11 +340,13 @@ def _transform_column(self, data: pd.DataFrame, data.loc[:, column_name_clean] = data[column_name].astype(object) # Fill missings first - data.loc[:, column_name_clean] = (CategoricalDataProcessor - ._replace_missings( - data, - column_name_clean - )) + data.loc[:, column_name_clean] = ( + CategoricalDataProcessor + ._replace_missings( + data, + column_name_clean + ) + ) if self.regroup: categories = self._cleaned_categories_by_column.get(column_name) @@ -332,20 +359,26 @@ def _transform_column(self, data: pd.DataFrame, "and will be skipped".format(column_name)) return data - data.loc[:, column_name_clean] = (CategoricalDataProcessor - ._replace_categories( - data[column_name_clean], - categories, - self.regroup_name)) + data.loc[:, column_name_clean] = ( + CategoricalDataProcessor + ._replace_categories( + data[column_name_clean], + categories, + self.regroup_name + ) + ) # change data to categorical - data.loc[:, column_name_clean] = (data[column_name_clean] - .astype("category")) + data.loc[:, column_name_clean] = data[column_name_clean].astype("category") return data - def fit_transform(self, data: pd.DataFrame, column_names: list, - target_column: str) -> pd.DataFrame: + def fit_transform( + self, + data: pd.DataFrame, + column_names: list, + target_column: str + ) -> pd.DataFrame: """Fit and transform the data. Parameters @@ -367,9 +400,11 @@ def fit_transform(self, data: pd.DataFrame, column_names: list, return self.transform(data, column_names) @staticmethod - def _get_small_categories(predictor_series: pd.Series, - incidence: float, - category_size_threshold: int) -> set: + def _get_small_categories( + predictor_series: pd.Series, + incidence: float, + category_size_threshold: int + ) -> set: """ Fetch categories with a size below a certain threshold. @@ -400,8 +435,10 @@ def _get_small_categories(predictor_series: pd.Series, return set(category_counts[bool_mask].index.tolist()) @staticmethod - def _replace_missings(data: pd.DataFrame, - column_names: Optional[list] = None) -> pd.DataFrame: + def _replace_missings( + data: pd.DataFrame, + column_names: Optional[list] = None + ) -> pd.DataFrame: """Replace missing values (incl. empty strings). Parameters @@ -431,9 +468,13 @@ def _replace_missings(data: pd.DataFrame, return temp @staticmethod - def _compute_p_value(X: pd.Series, y: pd.Series, category: str, - model_type: str, - scale_contingency_table: bool) -> float: + def _compute_p_value( + X: pd.Series, + y: pd.Series, + category: str, + model_type: str, + scale_contingency_table: bool + ) -> float: """ Calculate p-value. @@ -483,14 +524,19 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, pval = stats.chi2_contingency(contingency_table, correction=False)[1] elif model_type == "regression": - pval = stats.kruskal(df.y[df.other_categories == 0], - df.y[df.other_categories == 1])[1] + pval = stats.kruskal( + df.y[df.other_categories == 0], + df.y[df.other_categories == 1] + )[1] return pval @staticmethod - def _replace_categories(data: pd.Series, categories: set, - replace_with: str) -> pd.Series: + def _replace_categories( + data: pd.Series, + categories: set, + replace_with: str + ) -> pd.Series: """ Replace categories in set with "Other". 
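Note on the regrouping logic reformatted above: _compute_p_value boils down to a one-vs-rest significance test per category. A minimal sketch of the classification branch, under stated assumptions: the toy series and the pd.crosstab construction are illustrative, not the file's exact code; only stats.chi2_contingency mirrors the hunks above.

    # One-vs-rest chi^2 test per category, in the spirit of _compute_p_value.
    # The toy data below is hypothetical.
    import pandas as pd
    from scipy import stats

    X = pd.Series(["a", "a", "b", "b", "b", "c", "c", "c", "c", "c"])
    y = pd.Series([1, 0, 1, 1, 1, 0, 0, 0, 1, 0])

    # Flag every observation outside the category under test ("c")...
    df = pd.DataFrame({"other_categories": (X != "c").astype(int), "y": y})
    # ...and test whether the target distribution differs for "c" vs. the rest.
    contingency_table = pd.crosstab(df["other_categories"], df["y"])
    pval = stats.chi2_contingency(contingency_table, correction=False)[1]
    # pval > p_value_threshold means "c" is not significantly different and is
    # merged into the regroup_name bucket ("Other"), as the code above does.

For regression targets the same one-vs-rest split feeds stats.kruskal instead, as in the hunk above.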
diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py
index 1a903d9..7621ac8 100644
--- a/cobra/preprocessing/kbins_discretizer.py
+++ b/cobra/preprocessing/kbins_discretizer.py
@@ -1,6 +1,6 @@
-"""Binning of continous data."""
+"""Binning of continuous data."""
 # standard lib imports
 from copy import deepcopy
 from typing import List
 import numbers
 import logging
@@ -63,9 +64,11 @@ class KBinsDiscretizer(BaseEstimator):
     """
 
     valid_strategies = ("uniform", "quantile")
-    valid_keys = ["n_bins", "strategy", "closed", "auto_adapt_bins",
-                  "starting_precision", "label_format",
-                  "change_endpoint_format"]
+    valid_keys = [
+        "n_bins", "strategy", "closed", "auto_adapt_bins",
+        "starting_precision", "label_format",
+        "change_endpoint_format"
+    ]
 
     def __init__(
         self, n_bins: int = 10, strategy: str = "quantile",
@@ -104,14 +107,21 @@ def _validate_n_bins(self, n_bins: int):
            in case ``n_bins`` is not an integer or if ``n_bins < 2``
         """
         if not isinstance(n_bins, numbers.Integral):
-            raise ValueError("{} received an invalid n_bins type. "
-                             "Received {}, expected int."
-                             .format(KBinsDiscretizer.__name__,
-                                     type(n_bins).__name__))
+            raise ValueError(
+                "{} received an invalid n_bins type. Received {}, expected int."
+                .format(
+                    KBinsDiscretizer.__name__,
+                    type(n_bins).__name__
+                )
+            )
         if n_bins < 2:
-            raise ValueError("{} received an invalid number "
-                             "of bins. Received {}, expected at least 2."
-                             .format(KBinsDiscretizer.__name__, n_bins))
+            raise ValueError(
+                "{} received an invalid number of bins. Received {}, expected at least 2."
+                .format(
+                    KBinsDiscretizer.__name__,
+                    n_bins
+                )
+            )
 
     def attributes_to_dict(self) -> dict:
         """Return the attributes of KBinsDiscretizer as a dictionary.
@@ -148,9 +158,10 @@ def set_attributes_from_dict(self, params: dict):
         _bins_by_column = params.pop("_bins_by_column", {})
 
         if type(_bins_by_column) != dict:
-            raise ValueError("_bins_by_column is expected to be a dict "
-                             "but is of type {} instead"
-                             .format(type(_bins_by_column)))
+            raise ValueError(
+                "_bins_by_column is expected to be a dict but is of type {} instead"
+                .format(type(_bins_by_column))
+            )
 
         # Clean out params dictionary to remove unknown keys (for safety!)
         params = {key: params[key] for key in params if key in self.valid_keys}
@@ -177,17 +188,23 @@ def fit(self, data: pd.DataFrame, column_names: list):
             Names of the columns of the DataFrame to discretize
         """
         if self.strategy not in self.valid_strategies:
-            raise ValueError("{}: valid options for 'strategy' are {}. "
-                             "Got strategy={!r} instead."
-                             .format(KBinsDiscretizer.__name__,
-                                     self.valid_strategies, self.strategy))
-
-        for column_name in tqdm(column_names, desc="Computing "
-                                                   "discretization bins..."):
-
+            raise ValueError(
+                "{}: valid options for 'strategy' are {}. Got strategy={!r} instead."
+                .format(
+                    KBinsDiscretizer.__name__,
+                    self.valid_strategies, self.strategy
+                )
+            )
+
+        for column_name in tqdm(
+            column_names, desc="Computing discretization bins..."
+        ):
             if column_name not in data.columns:
-                log.warning("DataFrame has no column '{}', so it will be "
-                            "skipped in fitting" .format(column_name))
+                log.warning(
+                    "DataFrame has no column '{}', so it will be "
+                    "skipped in fitting"
+                    .format(column_name)
+                )
                 continue
 
             bins = self._fit_column(data, column_name)
@@ -195,8 +212,11 @@ def fit(self, data: pd.DataFrame, column_names: list):
             # Add to bins_by_column for later use
             self._bins_by_column[column_name] = bins
 
-    def _fit_column(self, data: pd.DataFrame,
-                    column_name: str) -> List[tuple]:
+    def _fit_column(
+        self,
+        data: pd.DataFrame,
+        column_name: str
+    ) -> List[tuple]:
         """Compute bins for a specific column in data.
 
         Parameters
@@ -214,25 +234,31 @@ def _fit_column(self, data: pd.DataFrame,
         col_min, col_max = data[column_name].min(), data[column_name].max()
 
         if col_min == col_max:
-            log.warning("Predictor '{}' is constant and "
-                        "will be ignored in computation".format(column_name))
+            log.warning(
+                "Predictor '{}' is constant and will be ignored in computation"
+                .format(column_name)
+            )
             return None
 
         prop_inf = (np.sum(np.isinf(data[column_name]))
                     / data[column_name].shape[0])
 
         if prop_inf > 0:
-            log.warning(f"Column {column_name} has "
-                        f"{prop_inf:.1%} inf values, thus it was skipped. "
-                        f"Consider dropping or transforming it.")
+            log.warning(
+                f"Column {column_name} has "
+                f"{prop_inf:.1%} inf values, thus it was skipped. "
+                f"Consider dropping or transforming it."
+            )
             return None
 
         prop_nan = data[column_name].isna().sum() / data[column_name].shape[0]
 
         if prop_nan >= 0.99:
-            log.warning(f"Column {column_name} is"
-                        f" {prop_nan:.1%}% NaNs, "
-                        f"consider dropping or transforming it.")
+            log.warning(
+                f"Column {column_name} is"
+                f" {prop_nan:.1%} NaNs, "
+                f"consider dropping or transforming it."
+            )
 
         n_bins = self.n_bins
         if self.auto_adapt_bins:
@@ -240,23 +266,37 @@ def _fit_column(self, data: pd.DataFrame,
             missing_pct = data[column_name].isnull().sum()/size
             n_bins = int(max(round((1 - missing_pct) * n_bins), 2))
 
-        bin_edges = self._compute_bin_edges(data, column_name, n_bins,
-                                            col_min, col_max)
+        bin_edges = self._compute_bin_edges(
+            data,
+            column_name,
+            n_bins,
+            col_min,
+            col_max
+        )
 
         if len(bin_edges) < 3:
-            log.warning("Only 1 bin was found for predictor '{}' so it will "
-                        "be ignored in computation".format(column_name))
+            log.warning(
+                "Only 1 bin was found for predictor '{}' so it will "
+                "be ignored in computation"
+                .format(column_name)
+            )
             return None
 
         if len(bin_edges) < n_bins + 1:
-            log.warning("The number of actual bins for predictor '{}' is {} "
-                        "which is smaller than the requested number of bins "
-                        "{}".format(column_name, len(bin_edges) - 1, n_bins))
+            log.warning(
+                "The number of actual bins for predictor '{}' is {} "
+                "which is smaller than the requested number of bins "
+                "{}"
+                .format(column_name, len(bin_edges) - 1, n_bins)
+            )
 
         return self._compute_bins_from_edges(bin_edges)
 
-    def transform(self, data: pd.DataFrame,
-                  column_names: list) -> pd.DataFrame:
+    def transform(
+        self,
+        data: pd.DataFrame,
+        column_names: list
+    ) -> pd.DataFrame:
         """Discretize the data in the given list of columns.
 
         This is done by mapping each number to
@@ -275,9 +315,10 @@ def transform(self, data: pd.DataFrame,
             data with additional discretized variables
         """
         if len(self._bins_by_column) == 0:
-            msg = ("{} instance is not fitted yet. Call 'fit' with "
-                   "appropriate arguments before using this method.")
-
+            msg = (
+                "{} instance is not fitted yet. 
Call 'fit' with "
+                "appropriate arguments before using this method."
+            )
             raise NotFittedError(msg.format(self.__class__.__name__))
 
         for column_name in tqdm(column_names, desc="Discretizing columns..."):
@@ -293,9 +334,11 @@ def transform(self, data: pd.DataFrame,
 
         return data
 
-    def _transform_column(self, data: pd.DataFrame,
-                          column_name: str,
-                          bins: List[tuple]) -> pd.DataFrame:
+    def _transform_column(
+        self, data: pd.DataFrame,
+        column_name: str,
+        bins: List[tuple]
+    ) -> pd.DataFrame:
         """Create a new column with binned values of column_name.
 
         Parameters
@@ -317,14 +360,18 @@ def _transform_column(self, data: pd.DataFrame,
         column_name_bin = column_name + "_bin"
 
         # use pd.cut to compute bins
-        data.loc[:, column_name_bin] = pd.cut(x=data[column_name],
-                                              bins=interval_idx)
+        data.loc[:, column_name_bin] = pd.cut(
+            x=data[column_name],
+            bins=interval_idx
+        )
 
         # Rename bins so that the output has a proper format
         bin_labels = self._create_bin_labels(bins)
 
-        data.loc[:, column_name_bin] = (data[column_name_bin]
-                                        .cat.rename_categories(bin_labels))
+        data.loc[:, column_name_bin] = (
+            data[column_name_bin]
+            .cat.rename_categories(bin_labels)
+        )
 
         if data[column_name_bin].isnull().sum() > 0:
 
@@ -337,8 +384,11 @@ def _transform_column(self, data: pd.DataFrame,
 
         return data
 
-    def fit_transform(self, data: pd.DataFrame,
-                      column_names: list) -> pd.DataFrame:
+    def fit_transform(
+        self,
+        data: pd.DataFrame,
+        column_names: list
+    ) -> pd.DataFrame:
         """Fit to data, then transform it.
 
         Parameters
@@ -356,9 +406,14 @@ def fit_transform(self, data: pd.DataFrame,
         self.fit(data, column_names)
         return self.transform(data, column_names)
 
-    def _compute_bin_edges(self, data: pd.DataFrame, column_name: str,
-                           n_bins: int, col_min: float,
-                           col_max: float) -> list:
+    def _compute_bin_edges(
+        self,
+        data: pd.DataFrame,
+        column_name: str,
+        n_bins: int,
+        col_min: float,
+        col_max: float
+    ) -> list:
         """Compute the desired bin edges.
 
         Parameters
@@ -381,9 +436,13 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str,
         """
         bin_edges = []
         if self.strategy == "quantile":
-            bin_edges = list(data[column_name]
-                             .quantile(np.linspace(0, 1, n_bins + 1),
-                                       interpolation="linear"))
+            bin_edges = list(
+                data[column_name]
+                .quantile(
+                    np.linspace(0, 1, n_bins + 1),
+                    interpolation="linear"
+                )
+            )
         elif self.strategy == "uniform":
             bin_edges = list(np.linspace(col_min, col_max, n_bins + 1))
 
@@ -397,8 +456,9 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str,
             bin_edges[-1] = np.inf
 
         if np.isnan(bin_edges).sum() > 0:
-            log.warning(f"Column {column_name} "
-                        "has NaNs present in bin definitions")
+            log.warning(
+                f"Column {column_name} has NaNs present in bin definitions"
+            )
 
         # Make absolutely sure bin edges are ordered,
        # in very rare situations this wasn't the case
@@ -499,10 +559,14 @@ def _create_index(
         """
         # check if closed is of the proper form
         if closed not in ["left", "right"]:
-            raise ValueError("{}: valid options for 'closed' are {}. "
-                             "Got strategy={!r} instead."
-                             .format(KBinsDiscretizer.__name__,
-                                     ["left", "right"], closed))
+            raise ValueError(
+                "{}: valid options for 'closed' are {}. "
+                "Got closed={!r} instead."
+ .format( + KBinsDiscretizer.__name__, + ["left", "right"], closed + ) + ) # deepcopy variable because we do not want to modify the content # of intervals (which is still used outside of this function) @@ -530,8 +594,13 @@ def _create_bin_labels(self, bins: List[tuple]) -> list: """ bin_labels = [] for interval in bins: - bin_labels.append(self.label_format.format(interval[0], - interval[1])) + bin_labels.append( + self.label_format + .format( + interval[0], + interval[1] + ) + ) # Format first and last bin as < x and > y resp. if self.change_endpoint_format: diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 5aa9bda..64c0fa9 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -156,19 +156,23 @@ def from_params( Class encapsulating CategoricalDataProcessor, KBinsDiscretizer, and TargetEncoder instances. """ - categorical_data_processor = CategoricalDataProcessor(model_type, - regroup, - regroup_name, keep_missing, - category_size_threshold, - p_value_threshold, - scale_contingency_table, - forced_categories) - - discretizer = KBinsDiscretizer(n_bins, strategy, closed, - auto_adapt_bins, - starting_precision, - label_format, - change_endpoint_format) + categorical_data_processor = CategoricalDataProcessor( + model_type, + regroup, + regroup_name, keep_missing, + category_size_threshold, + p_value_threshold, + scale_contingency_table, + forced_categories + ) + + discretizer = KBinsDiscretizer( + n_bins, strategy, closed, + auto_adapt_bins, + starting_precision, + label_format, + change_endpoint_format + ) target_encoder = TargetEncoder(weight, imputation_strategy) @@ -199,8 +203,10 @@ def from_pipeline(cls, pipeline: dict): and no others. """ if not PreProcessor._is_valid_pipeline(pipeline): - raise ValueError("Invalid pipeline, as it does not " - "contain all and only the required parameters.") + raise ValueError( + "Invalid pipeline, as it does not " + "contain all and only the required parameters." + ) categorical_data_processor = CategoricalDataProcessor() categorical_data_processor.set_attributes_from_dict( @@ -214,11 +220,20 @@ def from_pipeline(cls, pipeline: dict): target_encoder = TargetEncoder() target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - return cls(categorical_data_processor, discretizer, target_encoder, - is_fitted=pipeline["_is_fitted"]) + return cls( + categorical_data_processor, + discretizer, + target_encoder, + is_fitted=pipeline["_is_fitted"] + ) - def fit(self, train_data: pd.DataFrame, continuous_vars: list, - discrete_vars: list, target_column_name: str): + def fit( + self, + train_data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list, + target_column_name: str + ): """Fit the data to the preprocessing pipeline. Parameters @@ -233,9 +248,13 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, Column name of the target. 
""" # get list of all variables - preprocessed_variable_names = (PreProcessor - ._get_variable_list(continuous_vars, - discrete_vars)) + preprocessed_variable_names = ( + PreProcessor + ._get_variable_list( + continuous_vars, + discrete_vars + ) + ) log.info("Starting to fit pipeline") start = time.time() @@ -249,35 +268,55 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, if continuous_vars: begin = time.time() self._discretizer.fit(train_data, continuous_vars) - log.info("Fitting KBinsDiscretizer took {} seconds" - .format(time.time() - begin)) - - train_data = self._discretizer.transform(train_data, - continuous_vars) + log.info( + "Fitting KBinsDiscretizer took {} seconds" + .format(time.time() - begin) + ) + + train_data = self._discretizer.transform( + train_data, + continuous_vars + ) if discrete_vars: begin = time.time() - self._categorical_data_processor.fit(train_data, - discrete_vars, - target_column_name) - log.info("Fitting categorical_data_processor class took {} seconds" - .format(time.time() - begin)) - - train_data = (self._categorical_data_processor - .transform(train_data, discrete_vars)) + self._categorical_data_processor.fit( + train_data, + discrete_vars, + target_column_name + ) + log.info( + "Fitting categorical_data_processor class took {} seconds" + .format(time.time() - begin) + ) + + train_data = ( + self._categorical_data_processor + .transform(train_data, discrete_vars) + ) begin = time.time() - self._target_encoder.fit(train_data, preprocessed_variable_names, - target_column_name) - log.info("Fitting TargetEncoder took {} seconds" - .format(time.time() - begin)) + self._target_encoder.fit( + train_data, preprocessed_variable_names, + target_column_name + ) + log.info( + "Fitting TargetEncoder took {} seconds" + .format(time.time() - begin) + ) self._is_fitted = True # set fitted boolean to True - log.info("Fitting pipeline took {} seconds" - .format(time.time() - start)) + log.info( + "Fitting pipeline took {} seconds" + .format(time.time() - start) + ) - def transform(self, data: pd.DataFrame, continuous_vars: list, - discrete_vars: list) -> pd.DataFrame: + def transform( + self, + data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list + ) -> pd.DataFrame: """Transform the data by applying the preprocessing pipeline. Parameters @@ -302,33 +341,48 @@ def transform(self, data: pd.DataFrame, continuous_vars: list, start = time.time() if not self._is_fitted: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
+ ) raise NotFittedError(msg.format(self.__class__.__name__)) - preprocessed_variable_names = (PreProcessor - ._get_variable_list(continuous_vars, - discrete_vars)) + preprocessed_variable_names = ( + PreProcessor + ._get_variable_list( + continuous_vars, + discrete_vars + ) + ) if continuous_vars: data = self._discretizer.transform(data, continuous_vars) if discrete_vars: - data = self._categorical_data_processor.transform(data, - discrete_vars) - - data = self._target_encoder.transform(data, - preprocessed_variable_names) + data = self._categorical_data_processor.transform( + data, + discrete_vars + ) + + data = self._target_encoder.transform( + data, + preprocessed_variable_names + ) - log.info("Transforming data took {} seconds" - .format(time.time() - start)) + log.info( + "Transforming data took {} seconds" + .format(time.time() - start) + ) return data - def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, - discrete_vars: list, - target_column_name: str) -> pd.DataFrame: + def fit_transform( + self, + train_data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list, + target_column_name: str + ) -> pd.DataFrame: """Fit preprocessing pipeline and transform the data. Parameters @@ -347,8 +401,12 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, pd.DataFrame Transformed (preprocessed) data. """ - self.fit(train_data, continuous_vars, discrete_vars, - target_column_name) + self.fit( + train_data, + continuous_vars, + discrete_vars, + target_column_name + ) return self.transform(train_data, continuous_vars, discrete_vars) @@ -382,8 +440,10 @@ def train_selection_validation_split( DataFrame with additional split column. """ if not math.isclose(train_prop + selection_prop + validation_prop, 1.0): - raise ValueError("The sum of train_prop, selection_prop and " - "validation_prop must be 1.0.") + raise ValueError( + "The sum of train_prop, selection_prop and " + "validation_prop must be 1.0." + ) if train_prop == 0.0: raise ValueError("train_prop cannot be zero!") @@ -428,13 +488,17 @@ def serialize_pipeline(self) -> dict: } } - pipeline["categorical_data_processor"] = (self - ._categorical_data_processor - .attributes_to_dict()) + pipeline["categorical_data_processor"] = ( + self + ._categorical_data_processor + .attributes_to_dict() + ) pipeline["discretizer"] = self._discretizer.attributes_to_dict() - pipeline["target_encoder"] = (self._target_encoder - .attributes_to_dict()) + pipeline["target_encoder"] = ( + self._target_encoder + .attributes_to_dict() + ) pipeline["_is_fitted"] = True @@ -450,13 +514,20 @@ def _is_valid_pipeline(pipeline: dict) -> bool: Loaded pipeline from JSON file. 
""" keys = inspect.getfullargspec(PreProcessor.from_params).args - valid_keys = set([key for key in keys - if key not in ["cls", "serialization_path"]]) + valid_keys = set( + [ + key for key in keys + if key not in ["cls", "serialization_path"] + ] + ) input_keys = set() for key in pipeline: - if key in ["categorical_data_processor", "discretizer", - "target_encoder"]: + if key in [ + "categorical_data_processor", + "discretizer", + "target_encoder" + ]: input_keys = input_keys.union(set(pipeline[key].keys())) elif key != "metadata": input_keys.add(key) diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 0a9028f..7485b6b 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -76,16 +76,22 @@ def __init__( if weight < 0: raise ValueError("The value of weight cannot be smaller than zero.") elif imputation_strategy not in self.valid_imputation_strategies: - raise ValueError("Valid options for 'imputation_strategy' are {}." - " Got imputation_strategy={!r} instead." - .format(self.valid_imputation_strategies, - imputation_strategy)) + raise ValueError( + "Valid options for 'imputation_strategy' are {}. " + "Got imputation_strategy={!r} instead." + .format( + self.valid_imputation_strategies, + imputation_strategy + ) + ) if weight == 0: - log.warning("The target encoder's additive smoothing weight is " - "set to 0. This disables smoothing and may make the " - "encoding prone to overfitting. Increase the weight " - "if needed.") + log.warning( + "The target encoder's additive smoothing weight is " + "set to 0. This disables smoothing and may make the " + "encoding prone to overfitting. Increase the weight " + "if needed." + ) self.weight = weight self.imputation_strategy = imputation_strategy @@ -149,8 +155,12 @@ def dict_to_series(key, value): return self - def fit(self, data: pd.DataFrame, column_names: list, - target_column: str): + def fit( + self, + data: pd.DataFrame, + column_names: list, + target_column: str + ): """Fit the TargetEncoder to the data. Parameters @@ -169,8 +179,11 @@ def fit(self, data: pd.DataFrame, column_names: list, for column in tqdm(column_names, desc="Fitting target encoding..."): if column not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting" + .format(column) + ) continue self._mapping[column] = self._fit_column(data[column], y) @@ -205,8 +218,11 @@ def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series: return numerator / denominator - def transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: + def transform( + self, + data: pd.DataFrame, + column_names: list + ) -> pd.DataFrame: """Replace (e.g. encode) values of each categorical column with a new value (reflecting the corresponding average target value, optionally smoothed by a regularization weight), @@ -231,8 +247,10 @@ def transform(self, data: pd.DataFrame, method. """ if (len(self._mapping) == 0) or (self._global_mean is None): - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
+ ) raise NotFittedError(msg.format(self.__class__.__name__)) for column in tqdm(column_names, desc="Applying target encoding..."): @@ -248,8 +266,11 @@ def transform(self, data: pd.DataFrame, return data - def _transform_column(self, data: pd.DataFrame, - column_name: str) -> pd.DataFrame: + def _transform_column( + self, + data: pd.DataFrame, + column_name: str + ) -> pd.DataFrame: """Replace (e.g. encode) values of a categorical column with a new value (reflecting the corresponding average target value, optionally smoothed by a regularization weight), @@ -272,8 +293,10 @@ def _transform_column(self, data: pd.DataFrame, # Convert dtype to float, because when the original dtype # is of type "category", the resulting dtype would otherwise also be of # type "category": - data[new_column] = (data[column_name].map(self._mapping[column_name]) - .astype("float")) + data[new_column] = ( + data[column_name].map(self._mapping[column_name]) + .astype("float") + ) # In case of categorical data, it could be that new categories will # emerge which were not present in the train set, so this will result @@ -281,20 +304,20 @@ def _transform_column(self, data: pd.DataFrame, # configured imputation strategy: if data[new_column].isnull().sum() > 0: if self.imputation_strategy == "mean": - data[new_column].fillna(self._global_mean, - inplace=True) + data[new_column].fillna(self._global_mean, inplace=True) elif self.imputation_strategy == "min": - data[new_column].fillna(data[new_column].min(), - inplace=True) + data[new_column].fillna(data[new_column].min(), inplace=True) elif self.imputation_strategy == "max": - data[new_column].fillna(data[new_column].max(), - inplace=True) + data[new_column].fillna(data[new_column].max(), inplace=True) return data - def fit_transform(self, data: pd.DataFrame, - column_names: list, - target_column: str) -> pd.DataFrame: + def fit_transform( + self, + data: pd.DataFrame, + column_names: list, + target_column: str + ) -> pd.DataFrame: """Fit the encoder and transform the data. Parameters From 39a28fbf3cf6111b50eeafc813ca43485b7f37da Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Fri, 20 May 2022 11:07:41 +0200 Subject: [PATCH 5/9] feat: replace pylint with black --- .pylintrc | 585 ------------------ Makefile | 11 +- cobra/evaluation/__init__.py | 20 +- cobra/evaluation/evaluator.py | 261 +++----- cobra/evaluation/pigs_tables.py | 149 ++--- cobra/evaluation/plotting_utils.py | 102 ++- cobra/model_building/__init__.py | 14 +- cobra/model_building/forward_selection.py | 164 ++--- cobra/model_building/models.py | 86 +-- cobra/model_building/univariate_selection.py | 54 +- cobra/preprocessing/__init__.py | 5 +- .../categorical_data_processor.py | 161 ++--- cobra/preprocessing/kbins_discretizer.py | 150 ++--- cobra/preprocessing/preprocessor.py | 194 ++---- cobra/preprocessing/target_encoder.py | 75 +-- cobra/utils.py | 4 +- requirements.dev.txt | 2 +- 17 files changed, 472 insertions(+), 1565 deletions(-) delete mode 100644 .pylintrc diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index ee9601a..0000000 --- a/.pylintrc +++ /dev/null @@ -1,585 +0,0 @@ -[MASTER] - -# Specify a configuration file. -#rcfile= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Files or directories to be skipped. They should be base names, not -# paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the ignore-list. The -# regex matches against paths and can be in Posix or Windows format. 
-ignore-paths= - -# Files or directories matching the regex patterns are skipped. The regex -# matches against base names, not paths. -ignore-patterns=^\.# - -# Pickle collected data for later comparisons. -persistent=yes - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - pylint.extensions.check_elif, - pylint.extensions.bad_builtin, - pylint.extensions.docparams, - pylint.extensions.for_any_all, - pylint.extensions.set_membership, - pylint.extensions.code_style, - pylint.extensions.overlapping_exceptions, - pylint.extensions.typing, - pylint.extensions.redefined_variable_type, - pylint.extensions.comparison_placement, - -# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the -# number of processors available to use. -jobs=1 - -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code -extension-pkg-allow-list= - -# Minimum supported python version -py-version = 3.7.2 - -# Control the amount of potential inferred values when inferring a single -# object. This can help the performance when dealing with large functions or -# complex, nested conditions. -limit-inference-results=100 - -# Specify a score threshold to be exceeded before program exits with error. -fail-under=10.0 - -# Return non-zero exit code if any of these messages/categories are detected, -# even if score is above --fail-under value. Syntax same as enable. Messages -# specified are enabled, while categories only check already-enabled messages. -fail-on= - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -# confidence= - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable= - use-symbolic-message-instead, - useless-suppression, - fixme - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then re-enable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" - -disable= - attribute-defined-outside-init, - duplicate-code, - invalid-name, - missing-docstring, - protected-access, - too-few-public-methods, - # handled by black - format, - - -[REPORTS] - -# Set the output format. Available formats are text, parseable, colorized, msvs -# (visual studio) and html. 
You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages -reports=no - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables 'fatal', 'error', 'warning', 'refactor', 'convention' -# and 'info', which contain the number of messages in each category, as -# well as 'statement', which is the total number of statements analyzed. This -# score is used by the global evaluation report (RP0004). -evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - -# Activate the evaluation score. -score=yes - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging - -# The type of string formatting that logging methods do. `old` means using % -# formatting, `new` is for `{}` formatting. -logging-format-style=old - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME,XXX,TODO - -# Regular expression of note tags to take in consideration. -#notes-rgx= - - -[SIMILARITIES] - -# Minimum lines number of a similarity. -min-similarity-lines=4 - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - -# Signatures are removed from the similarity computation -ignore-signatures=no - - -[VARIABLES] - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). -dummy-variables-rgx=_$|dummy - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid defining new builtins when possible. -additional-builtins= - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_,_cb - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of names allowed to shadow builtins -allowed-redefined-builtins= - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore. -ignored-argument-names=_.* - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io - - -[FORMAT] - -# Maximum number of characters on a single line. -max-line-length=100 - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Maximum number of lines in a module -max-module-lines=2000 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# Expected format of line ending, e.g. 
empty (any line ending), LF or CRLF. -expected-line-ending-format= - - -[BASIC] - -# Good variable names which should always be accepted, separated by a comma -good-names=i,j,k,ex,Run,_, - ax, - cv, - df, - exc, - i, - j, - l, - lr, - m, - n, - q, - qq, - s, - t, - v, - x, - X, - X_train, - X_test, - y, - - -# Good variable names regexes, separated by a comma. If names match any regex, -# they will always be accepted -good-names-rgxs= - -# Bad variable names which should always be refused, separated by a comma -bad-names=foo,bar,baz,toto,tutu,tata - -# Bad variable names regexes, separated by a comma. If names match any regex, -# they will always be refused -bad-names-rgxs= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# Naming style matching correct function names. -function-naming-style=snake_case - -# Regular expression matching correct function names -function-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming style matching correct variable names. -variable-naming-style=snake_case - -# Regular expression matching correct variable names -variable-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming style matching correct constant names. -const-naming-style=UPPER_CASE - -# Regular expression matching correct constant names -const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Naming style matching correct attribute names. -attr-naming-style=snake_case - -# Regular expression matching correct attribute names -attr-rgx=[a-z_][a-z0-9_]{2,}$ - -# Naming style matching correct argument names. -argument-naming-style=snake_case - -# Regular expression matching correct argument names -argument-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming style matching correct class attribute names. -class-attribute-naming-style=any - -# Regular expression matching correct class attribute names -class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Naming style matching correct class constant names. -class-const-naming-style=UPPER_CASE - -# Regular expression matching correct class constant names. Overrides class- -# const-naming-style. -#class-const-rgx= - -# Naming style matching correct inline iteration names. -inlinevar-naming-style=any - -# Regular expression matching correct inline iteration names -inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ - -# Naming style matching correct class names. -class-naming-style=PascalCase - -# Regular expression matching correct class names -class-rgx=[A-Z_][a-zA-Z0-9]+$ - - -# Naming style matching correct module names. -module-naming-style=snake_case - -# Regular expression matching correct module names -module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - - -# Naming style matching correct method names. -method-naming-style=snake_case - -# Regular expression matching correct method names -method-rgx=[a-z_][a-z0-9_]{2,}$ - -# Regular expression which can overwrite the naming style set by typevar-naming-style. -#typevar-rgx= - -# Regular expression which should only match function or class names that do -# not require a docstring. Use ^(?!__init__$)_ to also check __init__. -no-docstring-rgx=__.*__ - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# List of decorators that define properties, such as abc.abstractproperty. 
-property-classes=abc.abstractproperty - - -[TYPECHECK] - -# Regex pattern to define which classes are considered mixins if ignore-mixin- -# members is set to 'yes' -mixin-class-rgx=.*MixIn - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis). It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=SQLObject, optparse.Values, thread._local, _thread._local - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members=REQUEST,acl_users,aq_parent,argparse.Namespace - -# List of decorators that create context managers from functions, such as -# contextlib.contextmanager. -contextmanager-decorators=contextlib.contextmanager - -# Tells whether to warn about missing members when the owner of the attribute -# is inferred to be None. -ignore-none=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - -[SPELLING] - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# List of comma separated words that should be considered directives if they -# appear and the beginning of a comment and should not be checked. -spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - -# Limits count of emitted suggestions for spelling mistakes. -max-spelling-suggestions=4 - - -[DESIGN] - -# Maximum number of arguments for function / method -max-args=10 - -# Maximum number of locals for function / method body -max-locals=25 - -# Maximum number of return / yield for function / method body -max-returns=11 - -# Maximum number of branch for function / method body -max-branches=27 - -# Maximum number of statements in function / method body -max-statements=100 - -# Maximum number of parents for a class (see R0901). 
-max-parents=7 - -# List of qualified class names to ignore when counting class parents (see R0901). -ignored-parents= - -# Maximum number of attributes for a class (see R0902). -max-attributes=11 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=25 - -# Maximum number of boolean expressions in an if statement (see R0916). -max-bool-expr=5 - -# List of regular expressions of class ancestor names to -# ignore when counting public methods (see R0903). -exclude-too-few-public-methods= - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__,__new__,setUp,__post_init__ - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=mcs - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict,_fields,_replace,_source,_make - -# Warn about protected attribute access inside special methods -check-protected-access-in-special-methods=no - -[IMPORTS] - -# List of modules that can be imported at any level, not just the top level -# one. -allow-any-import-level= - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=regsub,TERMIOS,Bastion,rexec - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - -# Couples of modules and preferred modules, separated by a comma. -preferred-modules= - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=Exception - - -[TYPING] - -# Set to ``no`` if the app / library does **NOT** need to support runtime -# introspection of type annotations. If you use type annotations -# **exclusively** for type checking of an application, you're probably fine. -# For libraries, evaluate if some users what to access the type hints at -# runtime first, e.g., through ``typing.get_type_hints``. Applies to Python -# versions 3.7 - 3.9 -runtime-typing = no - - -[DEPRECATED_BUILTINS] - -# List of builtins function names that should not be used, separated by a comma -bad-functions=map,input - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - -# Complete name of functions that never returns. 
When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit,argparse.parse_error - - -[STRING] - -# This flag controls whether inconsistent-quotes generates a warning when the -# character used as a quote delimiter is used inconsistently within a module. -check-quote-consistency=no - -# This flag controls whether the implicit-str-concat should generate a warning -# on implicit string concatenation in sequences defined over several lines. -check-str-concat-over-line-jumps=no - - -[CODE_STYLE] - -# Max line length for which to sill emit suggestions. Used to prevent optional -# suggestions which would get split by a code formatter (e.g., black). Will -# default to the setting for ``max-line-length``. -#max-line-length-suggestions= \ No newline at end of file diff --git a/Makefile b/Makefile index 29466d4..4789718 100644 --- a/Makefile +++ b/Makefile @@ -18,13 +18,8 @@ test-unit: pytest tests @echo 'unit tests OK' -lint: - pylint cobra - @echo 'lint OK' - -lint-minimal: - pylint -E cobra - @echo 'lint minimal OK' +black-check: + black --diff --line-length 120 cobra/ typecheck: mypy cobra --allow-redefinition --allow-untyped-globals --ignore-missing-imports @@ -38,4 +33,4 @@ docstyle: pydocstyle cobra @echo 'docstyle OK' -code-qa: typecheck codestyle docstyle lint-minimal +code-qa: typecheck codestyle docstyle diff --git a/cobra/evaluation/__init__.py b/cobra/evaluation/__init__.py index d480bdb..8302ea9 100644 --- a/cobra/evaluation/__init__.py +++ b/cobra/evaluation/__init__.py @@ -13,12 +13,14 @@ # from .evaluator import Evaluator from .evaluator import ClassificationEvaluator, RegressionEvaluator -__all__ = ["generate_pig_tables", - "compute_pig_table", - "plot_incidence", - "plot_performance_curves", - "plot_variable_importance", - "plot_univariate_predictor_quality", - "plot_correlation_matrix", - "ClassificationEvaluator", - "RegressionEvaluator"] +__all__ = [ + "generate_pig_tables", + "compute_pig_table", + "plot_incidence", + "plot_performance_curves", + "plot_variable_importance", + "plot_univariate_predictor_quality", + "plot_correlation_matrix", + "ClassificationEvaluator", + "RegressionEvaluator", +] diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 3255fa2..22e034b 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -1,5 +1,6 @@ """Evaluate the created model.""" +from typing import Any, Union, cast import numpy as np import pandas as pd @@ -30,7 +31,7 @@ DEFAULT_LABELS = ["0", "1"] -class ClassificationEvaluator(): +class ClassificationEvaluator: """Evaluator class encapsulating classification model metrics and plotting functionality. Attributes @@ -60,26 +61,21 @@ class ClassificationEvaluator(): (by default 10, so deciles). 
""" - def __init__( - self, - probability_cutoff: float = None, - lift_at: float = 0.05, - n_bins: int = 10 - ): + def __init__(self, probability_cutoff: float = None, lift_at: float = 0.05, n_bins: int = 10): """Initialize the ClassificationEvaluator.""" - self.y_true = None - self.y_pred = None + self.y_true: np.ndarray + self.y_pred: np.ndarray self.lift_at = lift_at self.probability_cutoff = probability_cutoff self.n_bins = n_bins # Placeholder to store fitted output - self.scalar_metrics = None - self.roc_curve = None - self.confusion_matrix = None - self.lift_curve = None - self.cumulative_gains = None + self.scalar_metrics: pd.Series + self.roc_curve: dict[str, Any] + self.confusion_matrix: np.ndarray + self.lift_curve: tuple[list[float], list[float], float] + self.cumulative_gains: tuple[np.ndarray, np.ndarray] def fit(self, y_true: np.ndarray, y_pred: np.ndarray): """Fit the evaluator by computing the relevant evaluation metrics on the inputs. @@ -95,20 +91,14 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): # if probability_cutoff is not set, take the optimal cut-off if not self.probability_cutoff: - self.probability_cutoff = (ClassificationEvaluator. - _compute_optimal_cutoff(fpr, tpr, - thresholds)) + self.probability_cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) # Transform probabilities to binary array using cut-off - y_pred_b = np.array([0 if pred <= self.probability_cutoff else 1 - for pred in y_pred]) + y_pred_b = np.array([0 if pred <= self.probability_cutoff else 1 for pred in y_pred]) # Compute the various evaluation metrics - self.scalar_metrics = ClassificationEvaluator._compute_scalar_metrics( - y_true, - y_pred, - y_pred_b, - self.lift_at + self.scalar_metrics = cast( + pd.Series, ClassificationEvaluator._compute_scalar_metrics(y_true, y_pred, y_pred_b, self.lift_at) ) self.y_true = y_true @@ -121,10 +111,7 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): @staticmethod def _compute_scalar_metrics( - y_true: np.ndarray, - y_pred: np.ndarray, - y_pred_b: np.ndarray, - lift_at: float + y_true: np.ndarray, y_pred: np.ndarray, y_pred_b: np.ndarray, lift_at: float ) -> pd.Series: """Compute various scalar performance measures. @@ -157,21 +144,19 @@ def _compute_scalar_metrics( The `column_order` and `pig_tables` parameters do not contain the same set of variables. """ - return pd.Series({ - "accuracy": accuracy_score(y_true, y_pred_b), - "AUC": roc_auc_score(y_true, y_pred), - "precision": precision_score(y_true, y_pred_b), - "recall": recall_score(y_true, y_pred_b), - "F1": f1_score(y_true, y_pred_b, average=None)[1], - "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b), - f"lift at {lift_at}": np.round( - ClassificationEvaluator - ._compute_lift( - y_true=y_true, - y_pred=y_pred, - lift_at=lift_at - ), 2) - }) + return pd.Series( + { + "accuracy": accuracy_score(y_true, y_pred_b), + "AUC": roc_auc_score(y_true, y_pred), + "precision": precision_score(y_true, y_pred_b), + "recall": recall_score(y_true, y_pred_b), + "F1": f1_score(y_true, y_pred_b, average=None)[1], + "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b), + f"lift at {lift_at}": np.round( + ClassificationEvaluator._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=lift_at), 2 + ), + } + ) def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): """Plot ROC curve of the model. @@ -190,8 +175,7 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): """ if self.roc_curve is None: msg = ( - "This {} instance is not fitted yet. 
Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -199,13 +183,15 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax.plot(self.roc_curve["fpr"], - self.roc_curve["tpr"], - color="cornflowerblue", linewidth=3, - label=f"ROC curve (area = {auc:.3})") + ax.plot( + self.roc_curve["fpr"], + self.roc_curve["tpr"], + color="cornflowerblue", + linewidth=3, + label=f"ROC curve (area = {auc:.3})", + ) - ax.plot([0, 1], [0, 1], color="darkorange", linewidth=3, - linestyle="--") + ax.plot([0, 1], [0, 1], color="darkorange", linewidth=3, linestyle="--") ax.set_xlabel("False Positive Rate", fontsize=15) ax.set_ylabel("True Positive Rate", fontsize=15) ax.legend(loc="lower right") @@ -216,11 +202,7 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): plt.show() - def plot_confusion_matrix( - self, path: str = None, - dim: tuple = (12, 8), - labels: list = None - ): + def plot_confusion_matrix(self, path: str = None, dim: tuple = (12, 8), labels: list = None): """Plot the confusion matrix. Parameters @@ -240,8 +222,7 @@ def plot_confusion_matrix( labels = labels or DEFAULT_LABELS if self.confusion_matrix is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -249,8 +230,10 @@ def plot_confusion_matrix( ax = sns.heatmap( self.confusion_matrix, annot=self.confusion_matrix.astype(str), - fmt="s", cmap="Blues", - xticklabels=labels, yticklabels=labels + fmt="s", + cmap="Blues", + xticklabels=labels, + yticklabels=labels, ) ax.set_title("Confusion matrix", fontsize=20) @@ -276,35 +259,30 @@ def plot_cumulative_response_curve(self, path: str = None, dim: tuple = (12, 8)) """ if self.lift_curve is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) x_labels, lifts, inc_rate = self.lift_curve - lifts = np.array(lifts)*inc_rate*100 + lifts = np.array(lifts) * inc_rate * 100 with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - plt.bar( - x_labels[::-1], - lifts, - align="center", - color="cornflowerblue") + plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") plt.ylabel("response (%)", fontsize=16) plt.xlabel("decile", fontsize=16) ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) plt.axhline( - y=inc_rate*100, + y=inc_rate * 100, color="darkorange", linestyle="--", xmin=0.05, xmax=0.95, linewidth=3, - label="Incidence" + label="Incidence", ) # Legend @@ -341,8 +319,7 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): """ if self.lift_curve is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." 
) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -351,22 +328,13 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - plt.bar(x_labels[::-1], lifts, align="center", - color="cornflowerblue") + plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") plt.ylabel("lift", fontsize=16) plt.xlabel("decile", fontsize=16) ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) - plt.axhline( - y=1, - color="darkorange", - linestyle="--", - xmin=0.05, - xmax=0.95, - linewidth=3, - label="Baseline" - ) + plt.axhline(y=1, color="darkorange", linestyle="--", xmin=0.05, xmax=0.95, linewidth=3, label="Baseline") # Legend ax.legend(loc="upper right") @@ -398,11 +366,14 @@ def plot_cumulative_gains(self, path: str = None, dim: tuple = (12, 8)): with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax.plot(self.cumulative_gains[0]*100, self.cumulative_gains[1]*100, - color="cornflowerblue", linewidth=3, - label="cumulative gains") - ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3, - ls="--", color="darkorange", label="random selection") + ax.plot( + self.cumulative_gains[0] * 100, + self.cumulative_gains[1] * 100, + color="cornflowerblue", + linewidth=3, + label="cumulative gains", + ) + ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3, ls="--", color="darkorange", label="random selection") ax.set_title("Cumulative Gains curve", fontsize=20) @@ -427,10 +398,7 @@ def plot_cumulative_gains(self, path: str = None, dim: tuple = (12, 8)): plt.show() @staticmethod - def _find_optimal_cutoff( - y_true: np.ndarray, - y_pred: np.ndarray - ) -> float: + def _find_optimal_cutoff(y_true: np.ndarray, y_pred: np.ndarray) -> float: """Find the optimal probability cut off point for a classification model. Parameters @@ -449,11 +417,7 @@ def _find_optimal_cutoff( return ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) @staticmethod - def _compute_optimal_cutoff( - fpr: np.ndarray, - tpr: np.ndarray, - thresholds: np.ndarray - ) -> float: + def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, thresholds: np.ndarray) -> float: """Calculate the optimal probability cut-off point for a classification model. The optimal cut-off would be where TPR is high and FPR is low, hence @@ -473,7 +437,7 @@ def _compute_optimal_cutoff( float Optimal probability cut-off point. """ - temp = np.absolute(tpr - (1-fpr)) + temp = np.absolute(tpr - (1 - fpr)) # index for optimal value is the one for which temp is minimal optimal_index = np.where(temp == min(temp))[0] @@ -481,10 +445,7 @@ def _compute_optimal_cutoff( return thresholds[optimal_index][0] @staticmethod - def _compute_cumulative_gains( - y_true: np.ndarray, - y_pred: np.ndarray - ) -> tuple: + def _compute_cumulative_gains(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[np.ndarray, np.ndarray]: """Compute cumulative gains of the model. Code from (https://github.com/reiinakano/scikit-plot/blob/ @@ -504,7 +465,7 @@ def _compute_cumulative_gains( With x-labels, and gains. 
""" # make y_true a boolean vector - y_true = (y_true == 1) + y_true = y_true == 1 sorted_indices = np.argsort(y_pred)[::-1] y_true = y_true[sorted_indices] @@ -522,10 +483,8 @@ def _compute_cumulative_gains( @staticmethod def _compute_lift_per_bin( - y_true: np.ndarray, - y_pred: np.ndarray, - n_bins: int = 10 - ) -> tuple: + y_true: np.ndarray, y_pred: np.ndarray, n_bins: int = 10 + ) -> tuple[list[float], list[float], float]: """Compute lift of the model for a given number of bins. Parameters @@ -544,25 +503,16 @@ def _compute_lift_per_bin( Includes x-labels, lifts per decile, and target incidence. """ lifts = [ - ClassificationEvaluator - ._compute_lift( - y_true=y_true, - y_pred=y_pred, - lift_at=perc_lift - ) - for perc_lift in np.linspace(1/n_bins, 1, num=n_bins, endpoint=True) + ClassificationEvaluator._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=perc_lift) + for perc_lift in np.linspace(1 / n_bins, 1, num=n_bins, endpoint=True) ] - x_labels = [len(lifts)-x for x in np.arange(0, len(lifts), 1)] + x_labels = [len(lifts) - x for x in np.arange(0, len(lifts), 1)] - return x_labels, lifts, y_true.mean() + return x_labels, lifts, cast(float, y_true.mean()) @staticmethod - def _compute_lift( - y_true: np.ndarray, - y_pred: np.ndarray, - lift_at: float = 0.05 - ) -> float: + def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, lift_at: float = 0.05) -> float: """Calculate lift on a specified level. Parameters @@ -592,22 +542,19 @@ def _compute_lift( # Calculate necessary variables nrows = len(y_data) - stop = int(np.floor(nrows*lift_at)) - avg_incidence = np.einsum("ij->j", y_true_)/float(len(y_true_)) + stop = int(np.floor(nrows * lift_at)) + avg_incidence = np.einsum("ij->j", y_true_) / float(len(y_true_)) # Sort and filter data - data_sorted = ( - y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] - .reshape(stop, 1) - ) + data_sorted = y_data[y_data[:, 1].argsort()[::-1]][:stop, 0].reshape(stop, 1) # Calculate lift (einsum is a very fast way of summing, but needs specific shape) - inc_in_top_n = np.einsum("ij->j", data_sorted)/float(len(data_sorted)) - lift = np.round(inc_in_top_n/avg_incidence, 2)[0] + inc_in_top_n = np.einsum("ij->j", data_sorted) / float(len(data_sorted)) + lift = np.round(inc_in_top_n / avg_incidence, 2)[0] return lift -class RegressionEvaluator(): +class RegressionEvaluator: """Evaluator class encapsulating regression model metrics and plotting functionality. Attributes @@ -651,10 +598,7 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.qq = RegressionEvaluator._compute_qq_residuals(y_true, y_pred) @staticmethod - def _compute_scalar_metrics( - y_true: np.ndarray, - y_pred: np.ndarray - ) -> pd.Series: + def _compute_scalar_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: """Compute various scalar performance measures. 
Parameters @@ -673,18 +617,17 @@ def _compute_scalar_metrics( Mean squared error (expected value of the quadratic error) Root mean squared error (sqrt of expected value of the quadratic error) """ - return pd.Series({ - "R2": r2_score(y_true, y_pred), - "MAE": mean_absolute_error(y_true, y_pred), - "MSE": mean_squared_error(y_true, y_pred), - "RMSE": sqrt(mean_squared_error(y_true, y_pred)) - }) + return pd.Series( + { + "R2": r2_score(y_true, y_pred), + "MAE": mean_absolute_error(y_true, y_pred), + "MSE": mean_squared_error(y_true, y_pred), + "RMSE": sqrt(mean_squared_error(y_true, y_pred)), + } + ) @staticmethod - def _compute_qq_residuals( - y_true: np.ndarray, - y_pred: np.ndarray - ) -> pd.Series: + def _compute_qq_residuals(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: """Compute various scalar performance measures. Parameters @@ -706,15 +649,17 @@ def _compute_qq_residuals( df = pd.DataFrame({"res": sorted((y_true - y_pred))}) # ascending order m, s = df["res"].mean(), df["res"].std() - df["z_res"] = df["res"].apply(lambda x: (x-m)/s) - df["rank"] = df.index+1 - df["percentile"] = df["rank"].apply(lambda x: x/(n+1)) # divide by n+1 to avoid inf + df["z_res"] = df["res"].apply(lambda x: (x - m) / s) + df["rank"] = df.index + 1 + df["percentile"] = df["rank"].apply(lambda x: x / (n + 1)) # divide by n+1 to avoid inf df["q_theoretical"] = norm.ppf(df["percentile"]) - return pd.Series({ - "quantiles": df["q_theoretical"].values, - "residuals": df["z_res"].values, - }) + return pd.Series( + { + "quantiles": df["q_theoretical"].values, + "residuals": df["z_res"].values, + } + ) def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): """Plot predictions from the model against actual values. @@ -733,8 +678,7 @@ def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): """ if self.y_true is None and self.y_pred is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -744,7 +688,7 @@ def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - x = np.arange(1, len(y_true)+1) + x = np.arange(1, len(y_true) + 1) ax.plot(x, y_true, ls="--", label="actuals", color="darkorange", linewidth=3) ax.plot(x, y_pred, label="predictions", color="cornflowerblue", linewidth=3) @@ -775,10 +719,7 @@ def plot_qq(self, path: str = None, dim: tuple = (12, 8)): The instance is not fitted yet. """ if self.qq is None: - msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." - ) + msg = "This {} instance is not fitted yet. Call 'fit' with appropriate arguments before using this method." 
raise NotFittedError(msg.format(self.__class__.__name__)) with plt.style.context("seaborn-whitegrid"): @@ -791,10 +732,10 @@ def plot_qq(self, path: str = None, dim: tuple = (12, 8)): ax.plot(x, y, label="current model", color="cornflowerblue", linewidth=3) ax.set_xlabel("Theoretical quantiles", fontsize=15) - ax.set_xticks(range(int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")])))+1, 1)) + ax.set_xticks(range(int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")]))) + 1, 1)) ax.set_ylabel("Standardized residuals", fontsize=15) - ax.set_yticks(range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")])))+1, 1)) + ax.set_yticks(range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")]))) + 1, 1)) ax.legend(loc="best") ax.set_title("Q-Q plot", fontsize=20) diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index e728dd0..dfbd85c 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -10,10 +10,7 @@ def generate_pig_tables( - basetable: pd.DataFrame, - id_column_name: str, - target_column_name: str, - preprocessed_predictors: list + basetable: pd.DataFrame, id_column_name: str, target_column_name: str, preprocessed_predictors: list ) -> pd.DataFrame: """Compute PIG tables for all predictors in preprocessed_predictors. @@ -38,12 +35,7 @@ def generate_pig_tables( DataFrame containing a PIG table for all predictors. """ pigs = [ - compute_pig_table( - basetable, - column_name, - target_column_name, - id_column_name - ) + compute_pig_table(basetable, column_name, target_column_name, id_column_name) for column_name in sorted(preprocessed_predictors) if column_name not in [id_column_name, target_column_name] ] @@ -52,10 +44,7 @@ def generate_pig_tables( def compute_pig_table( - basetable: pd.DataFrame, - predictor_column_name: str, - target_column_name: str, - id_column_name: str + basetable: pd.DataFrame, predictor_column_name: str, target_column_name: str, id_column_name: str ) -> pd.DataFrame: """Compute the PIG table of a given predictor for a given target. @@ -81,15 +70,10 @@ def compute_pig_table( # (=mean of the target for the given bin) and compute the bin size # (e.g. COUNT(id_column_name)). 
After that, rename the columns res = ( - basetable - .groupby(predictor_column_name) + basetable.groupby(predictor_column_name) .agg({target_column_name: "mean", id_column_name: "size"}) .reset_index() - .rename(columns={ - predictor_column_name: "label", - target_column_name: "avg_target", - id_column_name: "pop_size" - }) + .rename(columns={predictor_column_name: "label", target_column_name: "avg_target", id_column_name: "pop_size"}) ) # add the column name to a variable column @@ -97,21 +81,16 @@ def compute_pig_table( # replace population size by a percentage of total population res["variable"] = utils.clean_predictor_name(predictor_column_name) res["global_avg_target"] = global_avg_target - res["pop_size"] = res["pop_size"]/len(basetable.index) + res["pop_size"] = res["pop_size"] / len(basetable.index) # make sure to always return the data with the proper column order - column_order = ["variable", "label", "pop_size", - "global_avg_target", "avg_target"] + column_order = ["variable", "label", "pop_size", "global_avg_target", "avg_target"] return res[column_order] def plot_incidence( - pig_tables: pd.DataFrame, - variable: str, - model_type: str, - column_order: list = None, - dim: tuple = (12, 8) + pig_tables: pd.DataFrame, variable: str, model_type: str, column_order: list = None, dim: tuple = (12, 8) ): """Plot a Predictor Insights Graph (PIG). @@ -146,29 +125,22 @@ def plot_incidence( """ if model_type not in ["classification", "regression"]: raise ValueError( - "An unexpected value was set for the model_type " - "parameter. Expected 'classification' or " - "'regression'." + "An unexpected value was set for the model_type " "parameter. Expected 'classification' or " "'regression'." ) - df_plot = pig_tables[pig_tables['variable'] == variable].copy() + df_plot = pig_tables[pig_tables["variable"] == variable].copy() if column_order is not None: - if not set(df_plot['label']) == set(column_order): - raise ValueError( - 'The column_order and pig_tables parameters do not contain ' - 'the same set of variables.') - - df_plot['label'] = df_plot['label'].astype('category') - df_plot['label'].cat.reorder_categories( - column_order, - inplace=True - ) + if not set(df_plot["label"]) == set(column_order): + raise ValueError("The column_order and pig_tables parameters do not contain " "the same set of variables.") + + df_plot["label"] = df_plot["label"].astype("category") + df_plot["label"].cat.reorder_categories(column_order, inplace=True) - df_plot.sort_values(by=['label'], ascending=True, inplace=True) + df_plot.sort_values(by=["label"], ascending=True, inplace=True) df_plot.reset_index(inplace=True) else: - df_plot.sort_values(by=['avg_target'], ascending=False, inplace=True) + df_plot.sort_values(by=["avg_target"], ascending=False, inplace=True) df_plot.reset_index(inplace=True) with plt.style.context("seaborn-ticks"): @@ -177,41 +149,42 @@ def plot_incidence( # -------------------------- # Left axis - average target # -------------------------- - ax.plot(df_plot['label'], df_plot['avg_target'], - color="#00ccff", marker=".", - markersize=20, linewidth=3, - label='incidence rate per bin' if model_type == "classification" else "mean target value per bin", - zorder=10) + ax.plot( + df_plot["label"], + df_plot["avg_target"], + color="#00ccff", + marker=".", + markersize=20, + linewidth=3, + label="incidence rate per bin" if model_type == "classification" else "mean target value per bin", + zorder=10, + ) - ax.plot(df_plot['label'], df_plot['global_avg_target'], - color="#022252", 
linestyle='--', linewidth=4, - label='average incidence rate' if model_type == "classification" else "global mean target value", - zorder=10) + ax.plot( + df_plot["label"], + df_plot["global_avg_target"], + color="#022252", + linestyle="--", + linewidth=4, + label="average incidence rate" if model_type == "classification" else "global mean target value", + zorder=10, + ) # Dummy line to have label on second axis from first - ax.plot(np.nan, "#939598", linewidth=6, label='bin size') + ax.plot(np.nan, "#939598", linewidth=6, label="bin size") # Set labels & ticks - ax.set_ylabel( - 'incidence' if model_type == "classification" else "mean target value", - fontsize=16 - ) - ax.set_xlabel(f'{variable} bins' '', fontsize=16) + ax.set_ylabel("incidence" if model_type == "classification" else "mean target value", fontsize=16) + ax.set_xlabel(f"{variable} bins" "", fontsize=16) ax.xaxis.set_tick_params(labelsize=14) - plt.setp( - ax.get_xticklabels(), - rotation=45, - ha="right", - rotation_mode="anchor" - ) + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") ax.yaxis.set_tick_params(labelsize=14) if model_type == "classification": # Mean target values are between 0 and 1 (target incidence rate), # so format them as percentages - ax.set_yticks(np.arange(0, max(df_plot['avg_target'])+0.05, 0.05)) - ax.yaxis.set_major_formatter( - FuncFormatter(lambda y, _: f'{y:.1%}')) + ax.set_yticks(np.arange(0, max(df_plot["avg_target"]) + 0.05, 0.05)) + ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{y:.1%}")) elif model_type == "regression": # If the difference between the highest avg. target of all bins # versus the global avg. target AND the difference between the @@ -223,40 +196,38 @@ def plot_incidence( # the bins and versus the global avg. target. # (Motivation for the AND above: if on one end there IS enough # difference, the effect that we discuss here does not occur.) - global_avg_target = max(df_plot['global_avg_target']) # series of same number, for every bin. - if ((np.abs((max(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25) - and (np.abs((min(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)): - ax.set_ylim(global_avg_target * 0.75, - global_avg_target * 1.25) + global_avg_target = max(df_plot["global_avg_target"]) # series of same number, for every bin. 
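# Sketch with made-up numbers (not part of this diff) of the y-limit rule
# applied just below: when every bin average sits within 25% of the global
# average, the y-axis is pinned to +/- 25% of that average, so a small
# spread between bins is not stretched to fill the whole plot.
import numpy as np

avg_target = np.array([98.0, 101.0, 103.0])  # hypothetical bin averages
global_avg_target = 100.0                    # hypothetical global average

ylim = None
if (np.abs(avg_target.max() - global_avg_target) / global_avg_target < 0.25) and (
    np.abs(avg_target.min() - global_avg_target) / global_avg_target < 0.25
):
    ylim = (global_avg_target * 0.75, global_avg_target * 1.25)

print(ylim)  # (75.0, 125.0): the 5-unit spread stays visually modest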
+ if (np.abs((max(df_plot["avg_target"]) - global_avg_target)) / global_avg_target < 0.25) and ( + np.abs((min(df_plot["avg_target"]) - global_avg_target)) / global_avg_target < 0.25 + ): + ax.set_ylim(global_avg_target * 0.75, global_avg_target * 1.25) # Remove ticks but keep the labels - ax.tick_params(axis='both', which='both', length=0) - ax.tick_params(axis='y', colors="#00ccff") - ax.yaxis.label.set_color('#00ccff') + ax.tick_params(axis="both", which="both", length=0) + ax.tick_params(axis="y", colors="#00ccff") + ax.yaxis.label.set_color("#00ccff") # ----------------- # Right Axis - bins # ----------------- ax2 = ax.twinx() - ax2.bar(df_plot['label'], df_plot['pop_size'], - align='center', color="#939598", zorder=1) + ax2.bar(df_plot["label"], df_plot["pop_size"], align="center", color="#939598", zorder=1) # Set labels & ticks - ax2.set_xlabel(f'{variable} bins' '', fontsize=16) + ax2.set_xlabel(f"{variable} bins" "", fontsize=16) ax2.xaxis.set_tick_params(rotation=45, labelsize=14) ax2.yaxis.set_tick_params(labelsize=14) - ax2.yaxis.set_major_formatter( - FuncFormatter(lambda y, _: f'{y:.1%}')) - ax2.set_ylabel('population size', fontsize=16) - ax2.tick_params(axis='y', colors="#939598") - ax2.yaxis.label.set_color('#939598') + ax2.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{y:.1%}")) + ax2.set_ylabel("population size", fontsize=16) + ax2.tick_params(axis="y", colors="#939598") + ax2.yaxis.label.set_color("#939598") # Despine & prettify sns.despine(ax=ax, right=True, left=True) sns.despine(ax=ax2, left=True, right=False) - ax2.spines['right'].set_color('white') + ax2.spines["right"].set_color("white") ax2.grid(False) @@ -268,12 +239,12 @@ def plot_incidence( fig.suptitle(title, fontsize=22) ax.legend( frameon=False, - bbox_to_anchor=(0., 1.01, 1., .102), + bbox_to_anchor=(0.0, 1.01, 1.0, 0.102), loc=3, ncol=1, mode="expand", - borderaxespad=0., - prop={"size": 14} + borderaxespad=0.0, + prop={"size": 14}, ) # Set order of layers diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index ae91220..19fbf64 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -1,6 +1,7 @@ """Collection of plotting utils.""" # third party imports +from typing import cast import numpy as np import pandas as pd @@ -8,18 +9,10 @@ import seaborn as sns -DEFAULT_COLOURS = { - "train": "#0099bf", - "selection": "#ff9500", - "validation": "#8064a2" -} +DEFAULT_COLOURS = {"train": "#0099bf", "selection": "#ff9500", "validation": "#8064a2"} -def plot_univariate_predictor_quality( - df_metric: pd.DataFrame, - dim: tuple = (12, 8), - path: str = None -): +def plot_univariate_predictor_quality(df_metric: pd.DataFrame, dim: tuple = (12, 8), path: str = None): """Plot univariate quality of the predictors. 
Parameters @@ -40,17 +33,14 @@ def plot_univariate_predictor_quality( metric = "RMSE" ascending = True - df = ( - df_metric[df_metric["preselection"]] - .sort_values(by=metric+" selection", ascending=ascending) - ) + df = df_metric[df_metric["preselection"]].sort_values(by=metric + " selection", ascending=ascending) df = pd.melt( df, id_vars=["predictor"], - value_vars=[metric+" train", metric+" selection"], + value_vars=[metric + " train", metric + " selection"], var_name="split", - value_name=metric + value_name=metric, ) # plot data @@ -72,11 +62,7 @@ def plot_univariate_predictor_quality( plt.show() -def plot_correlation_matrix( - df_corr: pd.DataFrame, - dim: tuple = (12, 8), - path: str = None -): +def plot_correlation_matrix(df_corr: pd.DataFrame, dim: tuple = (12, 8), path: str = None): """Plot correlation matrix amongst the predictors. Parameters @@ -89,8 +75,8 @@ def plot_correlation_matrix( Path to store the figure. """ fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax = sns.heatmap(df_corr, cmap='Blues') - ax.set_title('Correlation Matrix') + ax = sns.heatmap(df_corr, cmap="Blues") + ax.set_title("Correlation Matrix") if path is not None: plt.savefig(path, format="png", dpi=300, bbox_inches="tight") @@ -103,7 +89,7 @@ def plot_performance_curves( dim: tuple = (12, 8), path: str = None, colors: dict = None, - metric_name: str = None + metric_name: str = None, ): """Plot performance curves for the train-selection-validation sets. @@ -131,40 +117,48 @@ def plot_performance_curves( metric_name = "AUC" elif model_type == "regression": metric_name = "RMSE" + metric_name = cast(str, metric_name) max_metric = np.round( max( - max(model_performance['train_performance']), - max(model_performance['selection_performance']), - max(model_performance['validation_performance']) - ), 1) + max(model_performance["train_performance"]), + max(model_performance["selection_performance"]), + max(model_performance["validation_performance"]), + ), + 1, + ) with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) plt.plot( - model_performance['train_performance'], marker=".", - markersize=20, linewidth=3, label="train", - color=colors["train"] + model_performance["train_performance"], + marker=".", + markersize=20, + linewidth=3, + label="train", + color=colors["train"], ) plt.plot( - model_performance['selection_performance'], marker=".", - markersize=20, linewidth=3, label="selection", - color=colors["selection"] + model_performance["selection_performance"], + marker=".", + markersize=20, + linewidth=3, + label="selection", + color=colors["selection"], ) plt.plot( - model_performance['validation_performance'], marker=".", - markersize=20, linewidth=3, label="validation", - color=colors["validation"] + model_performance["validation_performance"], + marker=".", + markersize=20, + linewidth=3, + label="validation", + color=colors["validation"], ) # Set x- and y-ticks - ax.set_xticks(np.arange(len(model_performance['last_added_predictor']))) - ax.set_xticklabels( - model_performance['last_added_predictor'].tolist(), - rotation=40, - ha='right' - ) + ax.set_xticks(np.arange(len(model_performance["last_added_predictor"]))) + ax.set_xticklabels(model_performance["last_added_predictor"].tolist(), rotation=40, ha="right") if model_type == "classification": ax.set_yticks(np.arange(0.5, max_metric + 0.02, 0.05)) @@ -172,16 +166,13 @@ def plot_performance_curves( # In regression, the scale of the y-axis can largely vary depending # on the dataset, it is easier to 
just set the y-axis bounds, # but not the tick distance. - ax.set_ylim(0, max_metric*1.1) + ax.set_ylim(0, max_metric * 1.1) # Make pretty - ax.legend(loc='lower right') - fig.suptitle( - 'Performance curves forward feature selection', - fontsize=20 - ) + ax.legend(loc="lower right") + fig.suptitle("Performance curves forward feature selection", fontsize=20) plt.title("Metric: " + metric_name, fontsize=15, loc="left") - plt.ylabel('Model performance') + plt.ylabel("Model performance") if path is not None: plt.savefig(path, format="png", dpi=300, bbox_inches="tight") @@ -190,10 +181,7 @@ def plot_performance_curves( def plot_variable_importance( - df_variable_importance: pd.DataFrame, - title: str = None, - dim: tuple = (12, 8), - path: str = None + df_variable_importance: pd.DataFrame, title: str = None, dim: tuple = (12, 8), path: str = None ): """Plot variable importance of a given model. @@ -210,11 +198,7 @@ def plot_variable_importance( """ with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax = sns.barplot( - x="importance", y="predictor", - data=df_variable_importance, - color="cornflowerblue" - ) + ax = sns.barplot(x="importance", y="predictor", data=df_variable_importance, color="cornflowerblue") if title: ax.set_title(title) else: diff --git a/cobra/model_building/__init__.py b/cobra/model_building/__init__.py index 288a2c4..c4d2a89 100644 --- a/cobra/model_building/__init__.py +++ b/cobra/model_building/__init__.py @@ -7,9 +7,11 @@ from .models import LogisticRegressionModel, LinearRegressionModel from .forward_selection import ForwardFeatureSelection -__all__ = ['compute_univariate_preselection', - 'get_preselected_predictors', - 'compute_correlations', - 'LogisticRegressionModel', - 'LinearRegressionModel', - 'ForwardFeatureSelection'] +__all__ = [ + "compute_univariate_preselection", + "get_preselected_predictors", + "compute_correlations", + "LogisticRegressionModel", + "LinearRegressionModel", + "ForwardFeatureSelection", +] diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 9b897d9..ee75b02 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -1,7 +1,7 @@ """Feature forward selection.""" import logging -from typing import Callable, Optional, Union +from typing import Callable, Optional, Set, Union, cast import pandas as pd from tqdm.auto import tqdm @@ -15,7 +15,7 @@ DEFAULT_FORCED_PREDICTORS = [] DEFAULT_EXCLUDED_PREDICTORS = [] -Model = Union[LinearRegressionModel, LogisticRegressionModel, None] +Model = Union[LinearRegressionModel, LogisticRegressionModel] class ForwardFeatureSelection: @@ -43,12 +43,7 @@ class ForwardFeatureSelection: List of fitted models. """ - def __init__( - self, - model_type: str = "classification", - max_predictors: int = 50, - pos_only: bool = True - ): + def __init__(self, model_type: str = "classification", max_predictors: int = 50, pos_only: bool = True): """Initialize the ForwardFeatureSelection class.""" self.model_type = model_type if model_type == "classification": @@ -59,7 +54,7 @@ def __init__( self.max_predictors = max_predictors self.pos_only = pos_only - self._fitted_models = [] + self._fitted_models: list[Model] = [] def get_model_from_step(self, step: int) -> Model: """Get fitted model from a particular step. @@ -80,15 +75,13 @@ def get_model_from_step(self, step: int) -> Model: In case step is larger than the number of available models. 
""" if len(self._fitted_models) <= step: - raise ValueError( - f"No model available for step {step}. " - "The first step starts from index 0." - ) + raise ValueError(f"No model available for step {step}. " "The first step starts from index 0.") return self._fitted_models[step] def compute_model_performances( - self, data: pd.DataFrame, + self, + data: pd.DataFrame, target_column_name: str, splits: list = None, metric: Optional[Callable] = None, @@ -125,29 +118,25 @@ def compute_model_performances( """ splits = splits or DEFAULT_SPLIT_NAMES results = [] - predictor_set = set([]) + predictor_set: Set[str] = set() for model in self._fitted_models: - last_added_predictor = ( - set(model.predictors) - .difference(predictor_set) - ) - tmp = { - "predictors": model.predictors, - "last_added_predictor": list(last_added_predictor)[0] - } + last_added_predictor = set(model.predictors).difference(predictor_set) + tmp = {"predictors": model.predictors, "last_added_predictor": list(last_added_predictor)[0]} # Evaluate model on each dataset split, # e.g. train-selection-validation - tmp.update({ - f"{split}_performance": model.evaluate( - data[data["split"] == split], - data[data["split"] == split][target_column_name], - split=split, # parameter used for caching - metric=metric - ) - for split in splits - }) + tmp.update( + { + f"{split}_performance": model.evaluate( + data[data["split"] == split], + data[data["split"] == split][target_column_name], + split=split, # parameter used for caching + metric=metric, + ) + for split in splits + } + ) results.append(tmp) predictor_set = predictor_set.union(set(model.predictors)) @@ -158,11 +147,12 @@ def compute_model_performances( return df def fit( - self, train_data: pd.DataFrame, + self, + train_data: pd.DataFrame, target_column_name: str, predictors: list, forced_predictors: list = None, - excluded_predictors: list = None + excluded_predictors: list = None, ): """Fit the forward feature selection estimator. @@ -190,54 +180,34 @@ def fit( number of allowed predictors in the model. """ assert "split" in train_data.columns, "The train_data input df does not include a split column." - assert len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0, \ - "The train_data input df does not include a 'train' and 'selection' split." + assert ( + len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0 + ), "The train_data input df does not include a 'train' and 'selection' split." # remove excluded predictors from predictor lists forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS excluded_predictors = excluded_predictors or DEFAULT_EXCLUDED_PREDICTORS filtered_predictors = [ - var for var in predictors - if ( - var not in excluded_predictors - and var not in forced_predictors - ) + var for var in predictors if (var not in excluded_predictors and var not in forced_predictors) ] # checks on predictor lists and self.max_predictors attr if len(forced_predictors) > self.max_predictors: - raise ValueError( - "Size of forced_predictors cannot be bigger than " - "max_predictors." - ) + raise ValueError("Size of forced_predictors cannot be bigger than " "max_predictors.") elif len(forced_predictors) == self.max_predictors: - log.info( - "Size of forced_predictors equals max_predictors " - "only one model will be trained..." 
- ) + log.info("Size of forced_predictors equals max_predictors " "only one model will be trained...") # train model with all forced_predictors (only) self._fitted_models.append( - self._train_model( - train_data[train_data["split"] == "train"], - target_column_name, - forced_predictors - ) + self._train_model(train_data[train_data["split"] == "train"], target_column_name, forced_predictors) ) else: self._fitted_models = self._forward_selection( - train_data, - target_column_name, - filtered_predictors, - forced_predictors + train_data, target_column_name, filtered_predictors, forced_predictors ) def _forward_selection( - self, - train_data: pd.DataFrame, - target_column_name: str, - predictors: list, - forced_predictors: list = None + self, train_data: pd.DataFrame, target_column_name: str, predictors: list, forced_predictors: list = None ) -> list[Model]: """Perform the forward feature selection algorithm. @@ -264,41 +234,25 @@ def _forward_selection( number of predictors minus one (as indices start from 0). """ forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS - fitted_models = [] - current_predictors = [] + fitted_models: list[Model] = [] + current_predictors: list[str] = [] - max_steps = 1 + min(self.max_predictors, - len(predictors) + len(forced_predictors)) + max_steps = 1 + min(self.max_predictors, len(predictors) + len(forced_predictors)) - for step in tqdm( - range(1, max_steps), - desc="Sequentially adding best predictor..." - ): + for step in tqdm(range(1, max_steps), desc="Sequentially adding best predictor..."): if step <= len(forced_predictors): # first, we go through the forced predictors - candidate_predictors = [ - var for var in forced_predictors - if var not in current_predictors - ] + candidate_predictors = [var for var in forced_predictors if var not in current_predictors] else: candidate_predictors = [ - var for var in (predictors + forced_predictors) - if var not in current_predictors + var for var in (predictors + forced_predictors) if var not in current_predictors ] - model = self._find_next_best_model( - train_data, - target_column_name, - candidate_predictors, - current_predictors - ) + model = self._find_next_best_model(train_data, target_column_name, candidate_predictors, current_predictors) if model is not None: # Add new model predictors to the list of current predictors - current_predictors = list( - set(current_predictors) - .union(set(model.predictors)) - ) + current_predictors = list(set(current_predictors).union(set(model.predictors))) fitted_models.append(model) # else: @@ -313,11 +267,7 @@ def _forward_selection( return fitted_models def _find_next_best_model( - self, - train_data: pd.DataFrame, - target_column_name: str, - candidate_predictors: list, - current_predictors: list + self, train_data: pd.DataFrame, target_column_name: str, candidate_predictors: list, current_predictors: list ) -> Model: """ Find the next best model with candidate predictors. 
@@ -367,20 +317,11 @@ def _find_next_best_model( for pred in candidate_predictors: # Train a model with an additional predictor - model = self._train_model( - fit_data, - target_column_name, - (current_predictors + [pred]) - ) + model = self._train_model(fit_data, target_column_name, (current_predictors + [pred])) # Evaluate the model - performance = ( - model - .evaluate( - sel_data[current_predictors + [pred]], - sel_data[target_column_name], - split="selection" - ) + performance = model.evaluate( + sel_data[current_predictors + [pred]], sel_data[target_column_name], split="selection" ) if self.pos_only and (not (model.get_coef() >= 0).all()): @@ -388,23 +329,16 @@ def _find_next_best_model( # Check if the model is better than the current best model # and if it is, replace the current best. - if self.MLModel == LogisticRegressionModel \ - and performance > best_performance: # AUC metric is used + if self.MLModel == LogisticRegressionModel and performance > best_performance: # AUC metric is used best_performance = performance best_model = model - elif self.MLModel == LinearRegressionModel \ - and performance < best_performance: # RMSE metric is used + elif self.MLModel == LinearRegressionModel and performance < best_performance: # RMSE metric is used best_performance = performance best_model = model - return best_model + return cast(Model, best_model) - def _train_model( - self, - train_data: pd.DataFrame, - target_column_name: str, - predictors: list - ) -> Model: + def _train_model(self, train_data: pd.DataFrame, target_column_name: str, predictors: list) -> Model: """Train the model with a given set of predictors. Parameters diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 58571b3..408ead4 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -33,8 +33,7 @@ class LogisticRegressionModel: def __init__(self): """Initialize the LogisticRegressionModel class.""" - self.logit = LogisticRegression(fit_intercept=True, C=1e9, - solver='liblinear', random_state=42) + self.logit = LogisticRegression(fit_intercept=True, C=1e9, solver="liblinear", random_state=42) self._is_fitted = False # placeholder to keep track of a list of predictors self.predictors = [] @@ -52,16 +51,18 @@ def serialize(self) -> dict: "meta": "logistic-regression", "predictors": self.predictors, "_eval_metrics_by_split": self._eval_metrics_by_split, - "params": self.logit.get_params() + "params": self.logit.get_params(), } if self._is_fitted: - serialized_model.update({ - "classes_": self.logit.classes_.tolist(), - "coef_": self.logit.coef_.tolist(), - "intercept_": self.logit.intercept_.tolist(), - "n_iter_": self.logit.n_iter_.tolist(), - }) + serialized_model.update( + { + "classes_": self.logit.classes_.tolist(), + "coef_": self.logit.coef_.tolist(), + "intercept_": self.logit.intercept_.tolist(), + "n_iter_": self.logit.n_iter_.tolist(), + } + ) return serialized_model @@ -90,7 +91,7 @@ def deserialize(self, model_dict: dict): self.predictors = model_dict["predictors"] self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"] - def get_coef(self) -> np.array: + def get_coef(self) -> np.ndarray: """Return the model coefficients. 
Returns @@ -151,11 +152,7 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: # ensure we have the proper predictors and the proper order return self.logit.predict_proba(X[self.predictors])[:, 1] - def evaluate( - self, X: pd.DataFrame, y: pd.Series, - split: str = None, - metric: Optional[Callable] = None - ) -> float: + def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str = None, metric: Optional[Callable] = None) -> float: """ Evaluate the model on a given dataset (X, y). @@ -188,7 +185,7 @@ def evaluate( y_pred = self.score_model(X) fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_pred) - cutoff = (ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds)) + cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) y_pred_b = np.array([0 if pred <= cutoff else 1 for pred in y_pred]) performance = metric(y_true=y, y_pred=y_pred_b) @@ -222,26 +219,18 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: y_pred = self.score_model(data) importance_by_variable = { - utils.clean_predictor_name(predictor): stats.pearsonr( - data[predictor], - y_pred - )[0] + utils.clean_predictor_name(predictor): stats.pearsonr(data[predictor], y_pred)[0] for predictor in self.predictors } - df = pd.DataFrame.from_dict(importance_by_variable, - orient="index").reset_index() + df = pd.DataFrame.from_dict(importance_by_variable, orient="index").reset_index() df.columns = ["predictor", "importance"] - return ( - df.sort_values(by="importance", ascending=False) - .reset_index(drop=True) - ) + return df.sort_values(by="importance", ascending=False).reset_index(drop=True) def _is_valid_dict(self, model_dict: dict) -> bool: """Check if the model dictionary is valid.""" - if ("meta" not in model_dict - or model_dict["meta"] != "logistic-regression"): + if "meta" not in model_dict or model_dict["meta"] != "logistic-regression": return False attr = ["classes_", "coef_", "intercept_", "n_iter_", "predictors"] @@ -249,8 +238,7 @@ def _is_valid_dict(self, model_dict: dict) -> bool: if not (key in model_dict or type(model_dict[key]) != list): return False - if ("params" not in model_dict - or "_eval_metrics_by_split" not in model_dict): + if "params" not in model_dict or "_eval_metrics_by_split" not in model_dict: return False return True @@ -292,14 +280,13 @@ def serialize(self) -> dict: "meta": "linear-regression", "predictors": self.predictors, "_eval_metrics_by_split": self._eval_metrics_by_split, - "params": self.linear.get_params() + "params": self.linear.get_params(), } if self._is_fitted: - serialized_model.update({ - "coef_": self.linear.coef_.tolist(), - "intercept_": self.linear.intercept_.tolist() - }) + serialized_model.update( + {"coef_": self.linear.coef_.tolist(), "intercept_": self.linear.intercept_.tolist()} + ) return serialized_model @@ -326,7 +313,7 @@ def deserialize(self, model_dict: dict): self.predictors = model_dict["predictors"] self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"] - def get_coef(self) -> np.array: + def get_coef(self) -> np.ndarray: """Return the model coefficients. 
Returns @@ -387,11 +374,7 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: # ensure we have the proper predictors and the proper order return self.linear.predict(X[self.predictors]) - def evaluate( - self, X: pd.DataFrame, y: pd.Series, - split: str = None, - metric: Optional[Callable] = None - ) -> float: + def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str = None, metric: Optional[Callable] = None) -> float: """Evaluate the model on a given dataset (X, y). The optional split @@ -451,29 +434,19 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: y_pred = self.score_model(data) importance_by_variable = { - utils.clean_predictor_name(predictor): stats.pearsonr( - data[predictor], - y_pred - )[0] + utils.clean_predictor_name(predictor): stats.pearsonr(data[predictor], y_pred)[0] for predictor in self.predictors } - df = pd.DataFrame.from_dict( - importance_by_variable, - orient="index" - ).reset_index() + df = pd.DataFrame.from_dict(importance_by_variable, orient="index").reset_index() df.columns = ["predictor", "importance"] - return ( - df.sort_values(by="importance", ascending=False) - .reset_index(drop=True) - ) + return df.sort_values(by="importance", ascending=False).reset_index(drop=True) @staticmethod def _is_valid_dict(model_dict: dict) -> bool: """Check if the model dictionary is valid.""" - if ("meta" not in model_dict - or model_dict["meta"] != "linear-regression"): + if "meta" not in model_dict or model_dict["meta"] != "linear-regression": return False attr = ["coef_", "intercept_", "predictors"] @@ -481,8 +454,7 @@ def _is_valid_dict(model_dict: dict) -> bool: if not (key in model_dict or not isinstance(model_dict[key], list)): return False - if ("params" not in model_dict - or "_eval_metrics_by_split" not in model_dict): + if "params" not in model_dict or "_eval_metrics_by_split" not in model_dict: return False return True diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index e4d1ff6..2d90b48 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -14,7 +14,7 @@ def compute_univariate_preselection( model_type: str = "classification", preselect_auc_threshold: float = 0.053, preselect_rmse_threshold: float = 5, - preselect_overtrain_threshold: float = 0.05 + preselect_overtrain_threshold: float = 0.05, ) -> pd.DataFrame: """Perform a preselection of predictors. @@ -74,21 +74,15 @@ def compute_univariate_preselection( cleaned_predictor = utils.clean_predictor_name(predictor) auc_train = roc_auc_score( - y_true=target_enc_train_data[target_column], - y_score=target_enc_train_data[predictor]) + y_true=target_enc_train_data[target_column], y_score=target_enc_train_data[predictor] + ) auc_selection = roc_auc_score( - y_true=target_enc_selection_data[target_column], - y_score=target_enc_selection_data[predictor]) - - result.append( - { - "predictor": cleaned_predictor, - "AUC train": auc_train, - "AUC selection": auc_selection - } + y_true=target_enc_selection_data[target_column], y_score=target_enc_selection_data[predictor] ) + result.append({"predictor": cleaned_predictor, "AUC train": auc_train, "AUC selection": auc_selection}) + df_auc = pd.DataFrame(result) # Filter based on min. 
AUC @@ -107,22 +101,18 @@ def compute_univariate_preselection( for predictor in predictors: cleaned_predictor = utils.clean_predictor_name(predictor) - rmse_train = sqrt(mean_squared_error( - y_true=target_enc_train_data[target_column], - y_pred=target_enc_train_data[predictor])) - - rmse_selection = sqrt(mean_squared_error( - y_true=target_enc_selection_data[target_column], - y_pred=target_enc_selection_data[predictor])) + rmse_train = sqrt( + mean_squared_error(y_true=target_enc_train_data[target_column], y_pred=target_enc_train_data[predictor]) + ) - result.append( - { - "predictor": cleaned_predictor, - "RMSE train": rmse_train, - "RMSE selection": rmse_selection - } + rmse_selection = sqrt( + mean_squared_error( + y_true=target_enc_selection_data[target_column], y_pred=target_enc_selection_data[predictor] + ) ) + result.append({"predictor": cleaned_predictor, "RMSE train": rmse_train, "RMSE selection": rmse_selection}) + df_rmse = pd.DataFrame(result) # Filter based on max. RMSE @@ -157,9 +147,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: """ if "AUC selection" in df_metric.columns: predictor_list = ( - df_metric[df_metric["preselection"]] - .sort_values(by="AUC selection", ascending=False) - .predictor.tolist() + df_metric[df_metric["preselection"]].sort_values(by="AUC selection", ascending=False).predictor.tolist() ) elif "RMSE selection" in df_metric.columns: predictor_list = ( @@ -171,10 +159,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: return [col + "_enc" for col in predictor_list] -def compute_correlations( - target_enc_train_data: pd.DataFrame, - predictors: list -) -> pd.DataFrame: +def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) -> pd.DataFrame: """Compute the correlations amongst the predictors in the DataFrame. Parameters @@ -192,10 +177,7 @@ def compute_correlations( """ correlations = target_enc_train_data[predictors].corr() - predictors_cleaned = [ - utils.clean_predictor_name(predictor) - for predictor in predictors - ] + predictors_cleaned = [utils.clean_predictor_name(predictor) for predictor in predictors] # Change index and columns with the cleaned version of the predictors # e.g. 
change "var1_enc" with "var1" diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py index b72d1a4..55e036b 100644 --- a/cobra/preprocessing/__init__.py +++ b/cobra/preprocessing/__init__.py @@ -5,7 +5,4 @@ from .categorical_data_processor import CategoricalDataProcessor from .preprocessor import PreProcessor -__all__ = ['KBinsDiscretizer', - 'TargetEncoder', - 'CategoricalDataProcessor', - 'PreProcessor'] +__all__ = ["KBinsDiscretizer", "TargetEncoder", "CategoricalDataProcessor", "PreProcessor"] diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 9d2f263..6632720 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -2,7 +2,7 @@ # standard lib imports import re -from typing import Optional +from typing import Any, Optional, Set, Union import logging # third party imports @@ -62,9 +62,14 @@ class CategoricalDataProcessor(BaseEstimator): """ valid_keys = [ - "model_type", "regroup", "regroup_name", "keep_missing", - "category_size_threshold", "p_value_threshold", - "scale_contingency_table", "forced_categories" + "model_type", + "regroup", + "regroup_name", + "keep_missing", + "category_size_threshold", + "p_value_threshold", + "scale_contingency_table", + "forced_categories", ] def __init__( @@ -76,7 +81,7 @@ def __init__( category_size_threshold: int = 5, p_value_threshold: float = 0.001, scale_contingency_table: bool = True, - forced_categories: dict = {} + forced_categories: dict = {}, ): """Initialize the CategoricalDataProcessor.""" if model_type not in ["classification", "regression"]: @@ -95,7 +100,7 @@ def __init__( self.forced_categories = forced_categories # dict to store fitted output in - self._cleaned_categories_by_column = {} + self._cleaned_categories_by_column: dict[str, Set[Any]] = {} def attributes_to_dict(self) -> dict: """Return the attributes of CategoricalDataProcessor as a dictionary. @@ -109,8 +114,7 @@ def attributes_to_dict(self) -> dict: params = self.get_params() params["_cleaned_categories_by_column"] = { - key: list(value) - for key, value in self._cleaned_categories_by_column.items() + key: list(value) for key, value in self._cleaned_categories_by_column.items() } return params @@ -134,8 +138,7 @@ def set_attributes_from_dict(self, params: dict): if type(_fitted_output) != dict: raise ValueError( "_cleaned_categories_by_column is expected to " - "be a dict but is of type {} instead" - .format(type(_fitted_output)) + "be a dict but is of type {} instead".format(type(_fitted_output)) ) # Clean out params dictionary to remove unknown keys (for safety!) @@ -145,18 +148,11 @@ def set_attributes_from_dict(self, params: dict): # of the following method from BaseEstimator: self.set_params(**params) - self._cleaned_categories_by_column = { - key: set(value) for key, value in _fitted_output.items() - } + self._cleaned_categories_by_column = {key: set(value) for key, value in _fitted_output.items()} return self - def fit( - self, - data: pd.DataFrame, - column_names: list, - target_column: str - ): + def fit(self, data: pd.DataFrame, column_names: list, target_column: str): """Fit the CategoricalDataProcessor. Parameters @@ -174,15 +170,9 @@ def fit( log.info("regroup was set to False, so no fitting is required") return None - for column_name in tqdm( - column_names, - desc="Fitting category regrouping..." 
- ): + for column_name in tqdm(column_names, desc="Fitting category regrouping..."): if column_name not in data.columns: - log.warning( - "DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column_name) - ) + log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column_name)) continue cleaned_cats = self._fit_column(data, column_name, target_column) @@ -194,8 +184,7 @@ def fit( # Add to _cleaned_categories_by_column for later use self._cleaned_categories_by_column[column_name] = cleaned_cats - def _fit_column(self, data: pd.DataFrame, column_name: str, - target_column) -> set: + def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> set: """ Fit all necessary columns into "Other". @@ -218,8 +207,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, model_type = self.model_type if len(data[column_name].unique()) == 1: - log.warning(f"Predictor {column_name} is constant" - " and will be ignored in computation.") + log.warning(f"Predictor {column_name} is constant" " and will be ignored in computation.") return set(data[column_name].unique()) y = data[target_column] @@ -228,48 +216,28 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, else: incidence = None - combined_categories = set() + combined_categories: Set[str] = set() # replace missings and get unique categories as a list - X = ( - CategoricalDataProcessor - ._replace_missings(data[column_name]) - .astype(object) - ) + X = CategoricalDataProcessor._replace_missings(data[column_name]).astype(object) unique_categories = list(X.unique()) # do not merge categories in case of dummies, i.e. 0 and 1 # (and possibly "Missing") - if (len(unique_categories) == 2 - or (len(unique_categories) == 3 - and "Missing" in unique_categories)): + if len(unique_categories) == 2 or (len(unique_categories) == 3 and "Missing" in unique_categories): return set(unique_categories) # get small categories and add them to the merged category list # does not apply incidence factor when model_type = "regression" - small_categories = ( - CategoricalDataProcessor - ._get_small_categories( - X, - incidence, - self.category_size_threshold - ) - ) + small_categories = CategoricalDataProcessor._get_small_categories(X, incidence, self.category_size_threshold) combined_categories = combined_categories.union(small_categories) for category in unique_categories: if category in small_categories: continue - pval = ( - CategoricalDataProcessor - ._compute_p_value( - X, y, category, - model_type, - self.scale_contingency_table - ) - ) + pval = CategoricalDataProcessor._compute_p_value(X, y, category, model_type, self.scale_contingency_table) # if not significant, add it to the list if pval > self.p_value_threshold: @@ -281,11 +249,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, return set(unique_categories).difference(combined_categories) - def transform( - self, - data: pd.DataFrame, - column_names: list - ) -> pd.DataFrame: + def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: """Transform the data. Parameters @@ -302,10 +266,7 @@ def transform( Data with additional transformed variables. """ if self.regroup and len(self._cleaned_categories_by_column) == 0: - msg = ( - "{} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." - ) + msg = "{} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." 
raise NotFittedError(msg.format(self.__class__.__name__)) for column_name in column_names: @@ -318,10 +279,7 @@ def transform( return data - def _transform_column( - self, data: pd.DataFrame, - column_name: str - ) -> pd.DataFrame: + def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFrame: """Create an additional column which combines categories into "Other". Parameters @@ -340,13 +298,7 @@ def _transform_column( data.loc[:, column_name_clean] = data[column_name].astype(object) # Fill missings first - data.loc[:, column_name_clean] = ( - CategoricalDataProcessor - ._replace_missings( - data, - column_name_clean - ) - ) + data.loc[:, column_name_clean] = CategoricalDataProcessor._replace_missings(data, column_name_clean) if self.regroup: categories = self._cleaned_categories_by_column.get(column_name) @@ -355,17 +307,11 @@ def _transform_column( # Log warning if categories is None, which indicates it is # not in fitted output if categories is None: - log.warning("Column '{}' is not in fitted output " - "and will be skipped".format(column_name)) + log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column_name)) return data - data.loc[:, column_name_clean] = ( - CategoricalDataProcessor - ._replace_categories( - data[column_name_clean], - categories, - self.regroup_name - ) + data.loc[:, column_name_clean] = CategoricalDataProcessor._replace_categories( + data[column_name_clean], categories, self.regroup_name ) # change data to categorical @@ -373,12 +319,7 @@ def _transform_column( return data - def fit_transform( - self, - data: pd.DataFrame, - column_names: list, - target_column: str - ) -> pd.DataFrame: + def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: str) -> pd.DataFrame: """Fit and transform the data. Parameters @@ -400,11 +341,7 @@ def fit_transform( return self.transform(data, column_names) @staticmethod - def _get_small_categories( - predictor_series: pd.Series, - incidence: float, - category_size_threshold: int - ) -> set: + def _get_small_categories(predictor_series: pd.Series, incidence: float, category_size_threshold: int) -> set: """ Fetch categories with a size below a certain threshold. @@ -431,14 +368,11 @@ def _get_small_categories( factor = 1 # Get all categories with a count below a threshold - bool_mask = (category_counts*factor) <= category_size_threshold + bool_mask = (category_counts * factor) <= category_size_threshold return set(category_counts[bool_mask].index.tolist()) @staticmethod - def _replace_missings( - data: pd.DataFrame, - column_names: Optional[list] = None - ) -> pd.DataFrame: + def _replace_missings(data: pd.DataFrame, column_names: Optional[Union[list[str], str]] = None) -> pd.DataFrame: """Replace missing values (incl. empty strings). Parameters @@ -469,11 +403,7 @@ def _replace_missings( @staticmethod def _compute_p_value( - X: pd.Series, - y: pd.Series, - category: str, - model_type: str, - scale_contingency_table: bool + X: pd.Series, y: pd.Series, category: str, model_type: str, scale_contingency_table: bool ) -> float: """ Calculate p-value. 
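For context on the hunk below: _compute_p_value is what decides whether a category keeps its own level or is folded into the regroup bucket. Here is a minimal, self-contained sketch of the one-vs-rest chi-squared test it applies in the classification case; the toy data and names are illustrative, not taken from cobra.

import numpy as np
import pandas as pd
from scipy import stats

# Toy data: is category "A" distinguishable from the rest with respect
# to a binary target? (values and names are illustrative only)
X = pd.Series(["A", "A", "B", "C", "B", "A", "C", "B"])
y = pd.Series([1, 1, 0, 0, 1, 1, 0, 0])

df = pd.DataFrame({"y": y})
df["other_categories"] = np.where(X == "A", 0, 1)  # one-vs-rest indicator

contingency_table = pd.crosstab(
    index=df["other_categories"], columns=df["y"], margins=False
)
pval = stats.chi2_contingency(
    contingency_table.values.astype(np.int64), correction=False
)[1]

# A p-value above p_value_threshold means "A" is statistically
# indistinguishable from the remaining categories and gets merged.
print(pval)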
@@ -509,34 +439,26 @@ def _compute_p_value( df["other_categories"] = np.where(X == category, 0, 1) if model_type == "classification": - contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"], - margins=False) + contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"], margins=False) # if true, we scale the "other" categories if scale_contingency_table: size_other_cats = contingency_table.iloc[1].sum() incidence_mean = y.mean() - contingency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats + contingency_table.iloc[1, 0] = (1 - incidence_mean) * size_other_cats contingency_table.iloc[1, 1] = incidence_mean * size_other_cats contingency_table = contingency_table.values.astype(np.int64) pval = stats.chi2_contingency(contingency_table, correction=False)[1] elif model_type == "regression": - pval = stats.kruskal( - df.y[df.other_categories == 0], - df.y[df.other_categories == 1] - )[1] + pval = stats.kruskal(df.y[df.other_categories == 0], df.y[df.other_categories == 1])[1] return pval @staticmethod - def _replace_categories( - data: pd.Series, - categories: set, - replace_with: str - ) -> pd.Series: + def _replace_categories(data: pd.Series, categories: set, replace_with: str) -> pd.Series: """ Replace categories in set with "Other". @@ -557,5 +479,4 @@ def _replace_categories( pd.Series Series with replaced categories. """ - return data.apply( - lambda x: str(x) if x in categories else replace_with) + return data.apply(lambda x: str(x) if x in categories else replace_with) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 7621ac8..0ad6265 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -2,7 +2,7 @@ # standard lib imports from copy import deepcopy from this import d -from typing import List +from typing import Dict, List, Optional, Union import numbers import logging import math @@ -65,18 +65,24 @@ class KBinsDiscretizer(BaseEstimator): valid_strategies = ("uniform", "quantile") valid_keys = [ - "n_bins", "strategy", "closed", "auto_adapt_bins", - "starting_precision", "label_format", - "change_endpoint_format" + "n_bins", + "strategy", + "closed", + "auto_adapt_bins", + "starting_precision", + "label_format", + "change_endpoint_format", ] def __init__( - self, n_bins: int = 10, strategy: str = "quantile", + self, + n_bins: int = 10, + strategy: str = "quantile", closed: str = "right", auto_adapt_bins: bool = False, starting_precision: int = 0, label_format: str = "{} - {}", - change_endpoint_format: bool = False + change_endpoint_format: bool = False, ): """Initialize the KBinsDiscretizer.""" # validate number of bins @@ -108,18 +114,14 @@ def _validate_n_bins(self, n_bins: int): """ if not isinstance(n_bins, numbers.Integral): raise ValueError( - "{} received an invalid n_bins type. Received {}, expected int." - .format( - KBinsDiscretizer.__name__, - type(n_bins).__name__ + "{} received an invalid n_bins type. Received {}, expected int.".format( + KBinsDiscretizer.__name__, type(n_bins).__name__ ) ) if n_bins < 2: raise ValueError( - "{} received an invalid number of bins. Received {}, expected at least 2." - .format( - KBinsDiscretizer.__name__, - n_bins + "{} received an invalid number of bins. 
Received {}, expected at least 2.".format( + KBinsDiscretizer.__name__, n_bins ) ) @@ -135,8 +137,7 @@ def attributes_to_dict(self) -> dict: params = self.get_params() params["_bins_by_column"] = { - key: [list(tup) for tup in value] if value else None - for key, value in self._bins_by_column.items() + key: [list(tup) for tup in value] if value else None for key, value in self._bins_by_column.items() } return params @@ -159,8 +160,7 @@ def set_attributes_from_dict(self, params: dict): if type(_bins_by_column) != dict: raise ValueError( - "_bins_by_column is expected to be a dict but is of type {} instead" - .format(type(_bins_by_column)) + "_bins_by_column is expected to be a dict but is of type {} instead".format(type(_bins_by_column)) ) # Clean out params dictionary to remove unknown keys (for safety!) @@ -171,8 +171,7 @@ def set_attributes_from_dict(self, params: dict): self.set_params(**params) self._bins_by_column = { - key: ([tuple(v) for v in value] if value else None) - for key, value in _bins_by_column.items() + key: ([tuple(v) for v in value] if value else None) for key, value in _bins_by_column.items() } return self @@ -189,22 +188,14 @@ def fit(self, data: pd.DataFrame, column_names: list): """ if self.strategy not in self.valid_strategies: raise ValueError( - "{}: valid options for 'strategy' are {}. Got strategy={!r} instead." - .format( - KBinsDiscretizer.__name__, - self.valid_strategies, self.strategy + "{}: valid options for 'strategy' are {}. Got strategy={!r} instead.".format( + KBinsDiscretizer.__name__, self.valid_strategies, self.strategy ) ) - for column_name in tqdm( - column_names, desc="Computing discretization bins..." - ): + for column_name in tqdm(column_names, desc="Computing discretization bins..."): if column_name not in data.columns: - log.warning( - "DataFrame has no column '{}', so it will be " - "skipped in fitting" - .format(column_name) - ) + log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column_name)) continue bins = self._fit_column(data, column_name) @@ -212,11 +203,7 @@ def fit(self, data: pd.DataFrame, column_names: list): # Add to bins_by_column for later use self._bins_by_column[column_name] = bins - def _fit_column( - self, - data: pd.DataFrame, - column_name: str - ) -> List[tuple]: + def _fit_column(self, data: pd.DataFrame, column_name: str) -> Optional[List[tuple]]: """Compute bins for a specific column in data. Parameters @@ -234,14 +221,10 @@ def _fit_column( col_min, col_max = data[column_name].min(), data[column_name].max() if col_min == col_max: - log.warning( - "Predictor '{}' is constant and will be ignored in computation" - .format(column_name) - ) + log.warning("Predictor '{}' is constant and will be ignored in computation".format(column_name)) return None - prop_inf = (np.sum(np.isinf(data[column_name])) - / data[column_name].shape[0]) + prop_inf = np.sum(np.isinf(data[column_name])) / data[column_name].shape[0] if prop_inf > 0: log.warning( @@ -254,31 +237,19 @@ def _fit_column( prop_nan = data[column_name].isna().sum() / data[column_name].shape[0] if prop_nan >= 0.99: - log.warning( - f"Column {column_name} is" - f" {prop_nan:.1%}% NaNs, " - f"consider dropping or transforming it." 
-        )
+        log.warning(f"Column {column_name} is {prop_nan:.1%} NaNs, consider dropping or transforming it.")

         n_bins = self.n_bins

         if self.auto_adapt_bins:
             size = len(data.index)
-            missing_pct = data[column_name].isnull().sum()/size
+            missing_pct = data[column_name].isnull().sum() / size
             n_bins = int(max(round((1 - missing_pct) * n_bins), 2))

-        bin_edges = self._compute_bin_edges(
-            data,
-            column_name,
-            n_bins,
-            col_min,
-            col_max
-        )
+        bin_edges = self._compute_bin_edges(data, column_name, n_bins, col_min, col_max)

         if len(bin_edges) < 3:
             log.warning(
-                "Only 1 bin was found for predictor '{}' so it will "
-                "be ignored in computation"
-                .format(column_name)
+                "Only 1 bin was found for predictor '{}' so it will " "be ignored in computation".format(column_name)
             )
             return None

@@ -286,17 +257,12 @@
         log.warning(
             "The number of actual bins for predictor '{}' is {} "
             "which is smaller than the requested number of bins "
-            "{}"
-            .format(column_name, len(bin_edges) - 1, n_bins)
+            "{}".format(column_name, len(bin_edges) - 1, n_bins)
         )

         return self._compute_bins_from_edges(bin_edges)

-    def transform(
-        self,
-        data: pd.DataFrame,
-        column_names: list
-    ) -> pd.DataFrame:
+    def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame:
         """Discretize the data in the given list of columns.

         This is done by mapping each number to
@@ -315,16 +281,12 @@ def transform(
         data with additional discretized variables
         """
         if len(self._bins_by_column) == 0:
-            msg = (
-                "{} instance is not fitted yet. Call 'fit' with "
-                "appropriate arguments before using this method."
-            )
+            msg = "{} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method."
             raise NotFittedError(msg.format(self.__class__.__name__))

         for column_name in tqdm(column_names, desc="Discretizing columns..."):
             if column_name not in self._bins_by_column:
-                log.warning("Column '{}' is not in fitted output "
-                            "and will be skipped".format(column_name))
+                log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column_name))
                 continue

             # can be None for a column with a constant value!
@@ -334,11 +296,7 @@ def transform(
         return data

-    def _transform_column(
-        self, data: pd.DataFrame,
-        column_name: str,
-        bins: List[tuple]
-    ) -> pd.DataFrame:
+    def _transform_column(self, data: pd.DataFrame, column_name: str, bins: List[tuple]) -> pd.DataFrame:
         """Create a new column with binned values of column_name.

         Parameters
@@ -360,18 +318,12 @@ def _transform_column(
         column_name_bin = column_name + "_bin"

         # use pd.cut to compute bins
-        data.loc[:, column_name_bin] = pd.cut(
-            x=data[column_name],
-            bins=interval_idx
-        )
+        data.loc[:, column_name_bin] = pd.cut(x=data[column_name], bins=interval_idx)

         # Rename bins so that the output has a proper format
         bin_labels = self._create_bin_labels(bins)

-        data.loc[:, column_name_bin] = (
-            data[column_name_bin]
-            .cat.rename_categories(bin_labels)
-        )
+        data.loc[:, column_name_bin] = data[column_name_bin].cat.rename_categories(bin_labels)

         if data[column_name_bin].isnull().sum() > 0:

@@ -384,11 +336,7 @@ def _transform_column(
         return data

-    def fit_transform(
-        self,
-        data: pd.DataFrame,
-        column_names: list
-    ) -> pd.DataFrame:
+    def fit_transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame:
        """Fit to data, then transform it. 
Parameters @@ -407,12 +355,7 @@ def fit_transform( return self.transform(data, column_names) def _compute_bin_edges( - self, - data: pd.DataFrame, - column_name: str, - n_bins: int, - col_min: float, - col_max: float + self, data: pd.DataFrame, column_name: str, n_bins: int, col_min: float, col_max: float ) -> list: """Compute the desired bin edges. @@ -434,6 +377,7 @@ def _compute_bin_edges( list list of bin edges from which to compute the bins """ + # fmt: off bin_edges = [] if self.strategy == "quantile": bin_edges = list( @@ -445,6 +389,7 @@ def _compute_bin_edges( ) elif self.strategy == "uniform": bin_edges = list(np.linspace(col_min, col_max, n_bins + 1)) + # fmt: on # nans lead to unexpected behavior during sorting, # by replacing with inf we ensure these stay at the @@ -456,9 +401,7 @@ def _compute_bin_edges( bin_edges[-1] = np.inf if np.isnan(bin_edges).sum() > 0: - log.warning( - f"Column {column_name} has NaNs present in bin definitions" - ) + log.warning(f"Column {column_name} has NaNs present in bin definitions") # Make absolutely sure bin edges are ordered, # in very rare situations this wasn't the case @@ -532,10 +475,7 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: return bins @staticmethod - def _create_index( - intervals: List[tuple], - closed: str = "right" - ) -> pd.IntervalIndex: + def _create_index(intervals: List[tuple], closed: str = "right") -> pd.IntervalIndex: """ Create an pd.IntervalIndex based on a list of tuples. @@ -558,6 +498,7 @@ def _create_index( Description """ # check if closed is of the proper form + # fmt: off if closed not in ["left", "right"]: raise ValueError( "{}: valid options for 'closed' are {}. " @@ -567,6 +508,7 @@ def _create_index( ["left", "right"], closed ) ) + # fmt: on # deepcopy variable because we do not want to modify the content # of intervals (which is still used outside of this function) @@ -593,6 +535,7 @@ def _create_bin_labels(self, bins: List[tuple]) -> list: list of (formatted) bin labels """ bin_labels = [] + # fmt: off for interval in bins: bin_labels.append( self.label_format @@ -601,6 +544,7 @@ def _create_bin_labels(self, bins: List[tuple]) -> list: interval[1] ) ) + # fmt: on # Format first and last bin as < x and > y resp. if self.change_endpoint_format: diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 64c0fa9..3a82efa 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -7,6 +7,7 @@ import logging from random import shuffle from datetime import datetime +from typing import Any, Set # third party imports import pandas as pd @@ -58,7 +59,7 @@ def __init__( categorical_data_processor: CategoricalDataProcessor, discretizer: KBinsDiscretizer, target_encoder: TargetEncoder, - is_fitted: bool = False + is_fitted: bool = False, ): """Initialize the PreProcessor class.""" self._categorical_data_processor = categorical_data_processor @@ -88,7 +89,7 @@ def from_params( scale_contingency_table: bool = True, forced_categories: dict = {}, weight: float = 0.0, - imputation_strategy: str = "mean" + imputation_strategy: str = "mean", ): """ Instantiate a PreProcessor from given or default params. 
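For context on the KBinsDiscretizer hunks above: the two binning strategies differ only in where the edges are placed. A minimal sketch with toy values follows; the exact percentile interpolation used upstream is hidden inside the `# fmt: off` block, so plain np.percentile is assumed here.

import numpy as np
import pandas as pd

# Toy numeric column; values are illustrative only.
data = pd.DataFrame({"age": [18, 22, 25, 31, 40, 41, 52, 60, 64, 70]})
n_bins = 4

# "quantile" strategy: edges at evenly spaced percentiles of the data,
# deduplicated so heavily repeated values cannot yield empty bins
quantile_edges = list(
    np.unique(np.percentile(data["age"].dropna(), np.linspace(0, 100, n_bins + 1)))
)

# "uniform" strategy: evenly spaced edges between the column min and max
uniform_edges = list(np.linspace(data["age"].min(), data["age"].max(), n_bins + 1))

print(quantile_edges)
print(uniform_edges)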
@@ -159,19 +160,16 @@ def from_params( categorical_data_processor = CategoricalDataProcessor( model_type, regroup, - regroup_name, keep_missing, + regroup_name, + keep_missing, category_size_threshold, p_value_threshold, scale_contingency_table, - forced_categories + forced_categories, ) discretizer = KBinsDiscretizer( - n_bins, strategy, closed, - auto_adapt_bins, - starting_precision, - label_format, - change_endpoint_format + n_bins, strategy, closed, auto_adapt_bins, starting_precision, label_format, change_endpoint_format ) target_encoder = TargetEncoder(weight, imputation_strategy) @@ -203,15 +201,10 @@ def from_pipeline(cls, pipeline: dict): and no others. """ if not PreProcessor._is_valid_pipeline(pipeline): - raise ValueError( - "Invalid pipeline, as it does not " - "contain all and only the required parameters." - ) + raise ValueError("Invalid pipeline, as it does not " "contain all and only the required parameters.") categorical_data_processor = CategoricalDataProcessor() - categorical_data_processor.set_attributes_from_dict( - pipeline["categorical_data_processor"] - ) + categorical_data_processor.set_attributes_from_dict(pipeline["categorical_data_processor"]) # model_type = categorical_data_processor.model_type discretizer = KBinsDiscretizer() @@ -220,20 +213,9 @@ def from_pipeline(cls, pipeline: dict): target_encoder = TargetEncoder() target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - return cls( - categorical_data_processor, - discretizer, - target_encoder, - is_fitted=pipeline["_is_fitted"] - ) + return cls(categorical_data_processor, discretizer, target_encoder, is_fitted=pipeline["_is_fitted"]) - def fit( - self, - train_data: pd.DataFrame, - continuous_vars: list, - discrete_vars: list, - target_column_name: str - ): + def fit(self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: list, target_column_name: str): """Fit the data to the preprocessing pipeline. Parameters @@ -248,13 +230,7 @@ def fit( Column name of the target. 
""" # get list of all variables - preprocessed_variable_names = ( - PreProcessor - ._get_variable_list( - continuous_vars, - discrete_vars - ) - ) + preprocessed_variable_names = PreProcessor._get_variable_list(continuous_vars, discrete_vars) log.info("Starting to fit pipeline") start = time.time() @@ -268,55 +244,25 @@ def fit( if continuous_vars: begin = time.time() self._discretizer.fit(train_data, continuous_vars) - log.info( - "Fitting KBinsDiscretizer took {} seconds" - .format(time.time() - begin) - ) + log.info("Fitting KBinsDiscretizer took {} seconds".format(time.time() - begin)) - train_data = self._discretizer.transform( - train_data, - continuous_vars - ) + train_data = self._discretizer.transform(train_data, continuous_vars) if discrete_vars: begin = time.time() - self._categorical_data_processor.fit( - train_data, - discrete_vars, - target_column_name - ) - log.info( - "Fitting categorical_data_processor class took {} seconds" - .format(time.time() - begin) - ) + self._categorical_data_processor.fit(train_data, discrete_vars, target_column_name) + log.info("Fitting categorical_data_processor class took {} seconds".format(time.time() - begin)) - train_data = ( - self._categorical_data_processor - .transform(train_data, discrete_vars) - ) + train_data = self._categorical_data_processor.transform(train_data, discrete_vars) begin = time.time() - self._target_encoder.fit( - train_data, preprocessed_variable_names, - target_column_name - ) - log.info( - "Fitting TargetEncoder took {} seconds" - .format(time.time() - begin) - ) + self._target_encoder.fit(train_data, preprocessed_variable_names, target_column_name) + log.info("Fitting TargetEncoder took {} seconds".format(time.time() - begin)) self._is_fitted = True # set fitted boolean to True - log.info( - "Fitting pipeline took {} seconds" - .format(time.time() - start) - ) + log.info("Fitting pipeline took {} seconds".format(time.time() - start)) - def transform( - self, - data: pd.DataFrame, - continuous_vars: list, - discrete_vars: list - ) -> pd.DataFrame: + def transform(self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list) -> pd.DataFrame: """Transform the data by applying the preprocessing pipeline. Parameters @@ -342,46 +288,26 @@ def transform( if not self._is_fitted: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." 
) raise NotFittedError(msg.format(self.__class__.__name__)) - preprocessed_variable_names = ( - PreProcessor - ._get_variable_list( - continuous_vars, - discrete_vars - ) - ) + preprocessed_variable_names = PreProcessor._get_variable_list(continuous_vars, discrete_vars) if continuous_vars: data = self._discretizer.transform(data, continuous_vars) if discrete_vars: - data = self._categorical_data_processor.transform( - data, - discrete_vars - ) + data = self._categorical_data_processor.transform(data, discrete_vars) - data = self._target_encoder.transform( - data, - preprocessed_variable_names - ) + data = self._target_encoder.transform(data, preprocessed_variable_names) - log.info( - "Transforming data took {} seconds" - .format(time.time() - start) - ) + log.info("Transforming data took {} seconds".format(time.time() - start)) return data def fit_transform( - self, - train_data: pd.DataFrame, - continuous_vars: list, - discrete_vars: list, - target_column_name: str + self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: list, target_column_name: str ) -> pd.DataFrame: """Fit preprocessing pipeline and transform the data. @@ -401,21 +327,13 @@ def fit_transform( pd.DataFrame Transformed (preprocessed) data. """ - self.fit( - train_data, - continuous_vars, - discrete_vars, - target_column_name - ) + self.fit(train_data, continuous_vars, discrete_vars, target_column_name) return self.transform(train_data, continuous_vars, discrete_vars) @staticmethod def train_selection_validation_split( - data: pd.DataFrame, - train_prop: float = 0.6, - selection_prop: float = 0.2, - validation_prop: float = 0.2 + data: pd.DataFrame, train_prop: float = 0.6, selection_prop: float = 0.2, validation_prop: float = 0.2 ) -> pd.DataFrame: """Add `split` column with train/selection/validation values to the dataset. @@ -440,10 +358,7 @@ def train_selection_validation_split( DataFrame with additional split column. """ if not math.isclose(train_prop + selection_prop + validation_prop, 1.0): - raise ValueError( - "The sum of train_prop, selection_prop and " - "validation_prop must be 1.0." - ) + raise ValueError("The sum of train_prop, selection_prop and " "validation_prop must be 1.0.") if train_prop == 0.0: raise ValueError("train_prop cannot be zero!") @@ -455,22 +370,19 @@ def train_selection_validation_split( size_train = int(train_prop * nrows) size_select = int(selection_prop * nrows) size_valid = int(validation_prop * nrows) - correction = nrows - (size_train+size_select+size_valid) + correction = nrows - (size_train + size_select + size_valid) split = ( - ['train'] * size_train - + ['train'] * correction - + ['selection'] * size_select - + ['validation'] * size_valid + ["train"] * size_train + ["train"] * correction + ["selection"] * size_select + ["validation"] * size_valid ) shuffle(split) - data['split'] = split + data["split"] = split return data - def serialize_pipeline(self) -> dict: + def serialize_pipeline(self) -> dict[str, Any]: """ Serialize the preprocessing pipeline. @@ -482,23 +394,13 @@ def serialize_pipeline(self) -> dict: dict Return the pipeline as a dictionary. 
""" - pipeline = { - "metadata": { - "timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S") - } - } - - pipeline["categorical_data_processor"] = ( - self - ._categorical_data_processor - .attributes_to_dict() - ) + pipeline: dict[str, Any] + pipeline = {"metadata": {"timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S")}} + + pipeline["categorical_data_processor"] = self._categorical_data_processor.attributes_to_dict() pipeline["discretizer"] = self._discretizer.attributes_to_dict() - pipeline["target_encoder"] = ( - self._target_encoder - .attributes_to_dict() - ) + pipeline["target_encoder"] = self._target_encoder.attributes_to_dict() pipeline["_is_fitted"] = True @@ -514,20 +416,11 @@ def _is_valid_pipeline(pipeline: dict) -> bool: Loaded pipeline from JSON file. """ keys = inspect.getfullargspec(PreProcessor.from_params).args - valid_keys = set( - [ - key for key in keys - if key not in ["cls", "serialization_path"] - ] - ) + valid_keys = set([key for key in keys if key not in ["cls", "serialization_path"]]) - input_keys = set() + input_keys: Set[str] = set() for key in pipeline: - if key in [ - "categorical_data_processor", - "discretizer", - "target_encoder" - ]: + if key in ["categorical_data_processor", "discretizer", "target_encoder"]: input_keys = input_keys.union(set(pipeline[key].keys())) elif key != "metadata": input_keys.add(key) @@ -560,8 +453,7 @@ def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: ValueError In case both lists are empty. """ - var_list = ([col + "_processed" for col in discrete_vars] - + [col + "_bin" for col in continuous_vars]) + var_list = [col + "_processed" for col in discrete_vars] + [col + "_bin" for col in continuous_vars] if not var_list: raise ValueError("Variable var_list is None or empty list.") diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 7485b6b..cd6bc34 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -68,21 +68,14 @@ class TargetEncoder(BaseEstimator): valid_imputation_strategies = ("mean", "min", "max") - def __init__( - self, weight: float = 0.0, - imputation_strategy: str = "mean" - ): + def __init__(self, weight: float = 0.0, imputation_strategy: str = "mean"): """Initialize the TargetEncoder class.""" if weight < 0: raise ValueError("The value of weight cannot be smaller than zero.") elif imputation_strategy not in self.valid_imputation_strategies: raise ValueError( "Valid options for 'imputation_strategy' are {}. " - "Got imputation_strategy={!r} instead." - .format( - self.valid_imputation_strategies, - imputation_strategy - ) + "Got imputation_strategy={!r} instead.".format(self.valid_imputation_strategies, imputation_strategy) ) if weight == 0: @@ -98,7 +91,7 @@ def __init__( self._mapping = {} # placeholder for fitted output # placeholder for the global incidence of the data used for fitting - self._global_mean = None + self._global_mean: float def attributes_to_dict(self) -> dict: """Return the attributes of TargetEncoder in a dictionary. 
@@ -111,10 +104,7 @@ def attributes_to_dict(self) -> dict: """ params = self.get_params() - params["_mapping"] = { - key: value.to_dict() - for key, value in self._mapping.items() - } + params["_mapping"] = {key: value.to_dict() for key, value in self._mapping.items()} params["_global_mean"] = self._global_mean @@ -132,8 +122,7 @@ def set_attributes_from_dict(self, params: dict): if "weight" in params and type(params["weight"]) == float: self.weight = params["weight"] - if ("imputation_strategy" in params and - params["imputation_strategy"] in self.valid_imputation_strategies): + if "imputation_strategy" in params and params["imputation_strategy"] in self.valid_imputation_strategies: self.imputation_strategy = params["imputation_strategy"] if "_global_mean" in params and type(params["_global_mean"]) == float: @@ -148,19 +137,11 @@ def dict_to_series(key, value): s.index.name = key return s - self._mapping = { - key: dict_to_series(key, value) - for key, value in _mapping.items() - } + self._mapping = {key: dict_to_series(key, value) for key, value in _mapping.items()} return self - def fit( - self, - data: pd.DataFrame, - column_names: list, - target_column: str - ): + def fit(self, data: pd.DataFrame, column_names: list, target_column: str): """Fit the TargetEncoder to the data. Parameters @@ -179,11 +160,7 @@ def fit( for column in tqdm(column_names, desc="Fitting target encoding..."): if column not in data.columns: - log.warning( - "DataFrame has no column '{}', so it will be " - "skipped in fitting" - .format(column) - ) + log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column)) continue self._mapping[column] = self._fit_column(data[column], y) @@ -211,18 +188,13 @@ def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series: stats = y.groupby(X).agg(["mean", "count"]) # Note: if self.weight = 0, we have the ordinary incidence replacement - numerator = (stats["count"] * stats["mean"] - + self.weight * self._global_mean) + numerator = stats["count"] * stats["mean"] + self.weight * self._global_mean denominator = stats["count"] + self.weight return numerator / denominator - def transform( - self, - data: pd.DataFrame, - column_names: list - ) -> pd.DataFrame: + def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: """Replace (e.g. encode) values of each categorical column with a new value (reflecting the corresponding average target value, optionally smoothed by a regularization weight), @@ -248,29 +220,22 @@ def transform( """ if (len(self._mapping) == 0) or (self._global_mean is None): msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) for column in tqdm(column_names, desc="Applying target encoding..."): if column not in data.columns: - log.warning("Unknown column '{}' will be skipped." 
- .format(column)) + log.warning("Unknown column '{}' will be skipped.".format(column)) continue elif column not in self._mapping: - log.warning("Column '{}' is not in fitted output " - "and will be skipped.".format(column)) + log.warning("Column '{}' is not in fitted output " "and will be skipped.".format(column)) continue data = self._transform_column(data, column) return data - def _transform_column( - self, - data: pd.DataFrame, - column_name: str - ) -> pd.DataFrame: + def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFrame: """Replace (e.g. encode) values of a categorical column with a new value (reflecting the corresponding average target value, optionally smoothed by a regularization weight), @@ -293,10 +258,7 @@ def _transform_column( # Convert dtype to float, because when the original dtype # is of type "category", the resulting dtype would otherwise also be of # type "category": - data[new_column] = ( - data[column_name].map(self._mapping[column_name]) - .astype("float") - ) + data[new_column] = data[column_name].map(self._mapping[column_name]).astype("float") # In case of categorical data, it could be that new categories will # emerge which were not present in the train set, so this will result @@ -312,12 +274,7 @@ def _transform_column( return data - def fit_transform( - self, - data: pd.DataFrame, - column_names: list, - target_column: str - ) -> pd.DataFrame: + def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: str) -> pd.DataFrame: """Fit the encoder and transform the data. Parameters diff --git a/cobra/utils.py b/cobra/utils.py index b7727dd..0287947 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -8,6 +8,4 @@ def clean_predictor_name(predictor_name: str) -> str: This is done by stripping the redundant suffix (e.g. 
"_enc" or "_bin") off from the end of the predictor name to return a clean version of the predictor """ - return (predictor_name.replace("_enc", "") - .replace("_bin", "") - .replace("_processed", "")) + return predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "") diff --git a/requirements.dev.txt b/requirements.dev.txt index 3d87710..9534dc0 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,6 +1,6 @@ +black>=22.3.0 mypy>=0.942 pycodestyle>=2.8.0 pydocstyle>=6.1.1 -pylint>=2.13.7 pytest>=7.1.1 pytest-mock>=3.7.0 \ No newline at end of file From aac52033e6fb460073b4bf75dfa203edc2208983 Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Fri, 20 May 2022 13:18:55 +0200 Subject: [PATCH 6/9] feat: add make black command, line length 80 instead of 120 --- Makefile | 5 +- cobra/evaluation/evaluator.py | 112 ++++++++++++++---- cobra/evaluation/pigs_tables.py | 68 +++++++++-- cobra/evaluation/plotting_utils.py | 28 ++++- cobra/model_building/forward_selection.py | 112 ++++++++++++++---- cobra/model_building/models.py | 45 +++++-- cobra/model_building/univariate_selection.py | 54 +++++++-- cobra/preprocessing/__init__.py | 7 +- .../categorical_data_processor.py | 77 +++++++++--- cobra/preprocessing/kbins_discretizer.py | 63 +++++++--- cobra/preprocessing/preprocessor.py | 106 +++++++++++++---- cobra/preprocessing/target_encoder.py | 38 ++++-- cobra/utils.py | 4 +- 13 files changed, 564 insertions(+), 155 deletions(-) diff --git a/Makefile b/Makefile index 4789718..c0b115b 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,10 @@ test-unit: @echo 'unit tests OK' black-check: - black --diff --line-length 120 cobra/ + black --diff --line-length 80 cobra/ + +black: + black cobra/ typecheck: mypy cobra --allow-redefinition --allow-untyped-globals --ignore-missing-imports diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 22e034b..278bf9d 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -61,7 +61,9 @@ class ClassificationEvaluator: (by default 10, so deciles). 
""" - def __init__(self, probability_cutoff: float = None, lift_at: float = 0.05, n_bins: int = 10): + def __init__( + self, probability_cutoff: float = None, lift_at: float = 0.05, n_bins: int = 10 + ): """Initialize the ClassificationEvaluator.""" self.y_true: np.ndarray self.y_pred: np.ndarray @@ -91,14 +93,21 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): # if probability_cutoff is not set, take the optimal cut-off if not self.probability_cutoff: - self.probability_cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) + self.probability_cutoff = ClassificationEvaluator._compute_optimal_cutoff( + fpr, tpr, thresholds + ) # Transform probabilities to binary array using cut-off - y_pred_b = np.array([0 if pred <= self.probability_cutoff else 1 for pred in y_pred]) + y_pred_b = np.array( + [0 if pred <= self.probability_cutoff else 1 for pred in y_pred] + ) # Compute the various evaluation metrics self.scalar_metrics = cast( - pd.Series, ClassificationEvaluator._compute_scalar_metrics(y_true, y_pred, y_pred_b, self.lift_at) + pd.Series, + ClassificationEvaluator._compute_scalar_metrics( + y_true, y_pred, y_pred_b, self.lift_at + ), ) self.y_true = y_true @@ -106,8 +115,12 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.roc_curve = {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} self.confusion_matrix = confusion_matrix(y_true, y_pred_b) - self.lift_curve = ClassificationEvaluator._compute_lift_per_bin(y_true, y_pred, self.n_bins) - self.cumulative_gains = ClassificationEvaluator._compute_cumulative_gains(y_true, y_pred) + self.lift_curve = ClassificationEvaluator._compute_lift_per_bin( + y_true, y_pred, self.n_bins + ) + self.cumulative_gains = ClassificationEvaluator._compute_cumulative_gains( + y_true, y_pred + ) @staticmethod def _compute_scalar_metrics( @@ -153,7 +166,10 @@ def _compute_scalar_metrics( "F1": f1_score(y_true, y_pred_b, average=None)[1], "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b), f"lift at {lift_at}": np.round( - ClassificationEvaluator._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=lift_at), 2 + ClassificationEvaluator._compute_lift( + y_true=y_true, y_pred=y_pred, lift_at=lift_at + ), + 2, ), } ) @@ -175,7 +191,8 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): """ if self.roc_curve is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -202,7 +219,9 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): plt.show() - def plot_confusion_matrix(self, path: str = None, dim: tuple = (12, 8), labels: list = None): + def plot_confusion_matrix( + self, path: str = None, dim: tuple = (12, 8), labels: list = None + ): """Plot the confusion matrix. Parameters @@ -222,7 +241,8 @@ def plot_confusion_matrix(self, path: str = None, dim: tuple = (12, 8), labels: labels = labels or DEFAULT_LABELS if self.confusion_matrix is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -259,7 +279,8 @@ def plot_cumulative_response_curve(self, path: str = None, dim: tuple = (12, 8)) """ if self.lift_curve is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -319,7 +340,8 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): """ if self.lift_curve is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -334,7 +356,15 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) - plt.axhline(y=1, color="darkorange", linestyle="--", xmin=0.05, xmax=0.95, linewidth=3, label="Baseline") + plt.axhline( + y=1, + color="darkorange", + linestyle="--", + xmin=0.05, + xmax=0.95, + linewidth=3, + label="Baseline", + ) # Legend ax.legend(loc="upper right") @@ -373,7 +403,14 @@ def plot_cumulative_gains(self, path: str = None, dim: tuple = (12, 8)): linewidth=3, label="cumulative gains", ) - ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3, ls="--", color="darkorange", label="random selection") + ax.plot( + ax.get_xlim(), + ax.get_ylim(), + linewidth=3, + ls="--", + color="darkorange", + label="random selection", + ) ax.set_title("Cumulative Gains curve", fontsize=20) @@ -417,7 +454,9 @@ def _find_optimal_cutoff(y_true: np.ndarray, y_pred: np.ndarray) -> float: return ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) @staticmethod - def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, thresholds: np.ndarray) -> float: + def _compute_optimal_cutoff( + fpr: np.ndarray, tpr: np.ndarray, thresholds: np.ndarray + ) -> float: """Calculate the optimal probability cut-off point for a classification model. The optimal cut-off would be where TPR is high and FPR is low, hence @@ -445,7 +484,9 @@ def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, thresholds: np.nda return thresholds[optimal_index][0] @staticmethod - def _compute_cumulative_gains(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + def _compute_cumulative_gains( + y_true: np.ndarray, y_pred: np.ndarray + ) -> tuple[np.ndarray, np.ndarray]: """Compute cumulative gains of the model. Code from (https://github.com/reiinakano/scikit-plot/blob/ @@ -503,7 +544,9 @@ def _compute_lift_per_bin( Includes x-labels, lifts per decile, and target incidence. """ lifts = [ - ClassificationEvaluator._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=perc_lift) + ClassificationEvaluator._compute_lift( + y_true=y_true, y_pred=y_pred, lift_at=perc_lift + ) for perc_lift in np.linspace(1 / n_bins, 1, num=n_bins, endpoint=True) ] @@ -512,7 +555,9 @@ def _compute_lift_per_bin( return x_labels, lifts, cast(float, y_true.mean()) @staticmethod - def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, lift_at: float = 0.05) -> float: + def _compute_lift( + y_true: np.ndarray, y_pred: np.ndarray, lift_at: float = 0.05 + ) -> float: """Calculate lift on a specified level. Parameters @@ -589,7 +634,9 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): Model scores. 
""" # Compute the various evaluation metrics - self.scalar_metrics = RegressionEvaluator._compute_scalar_metrics(y_true, y_pred) + self.scalar_metrics = RegressionEvaluator._compute_scalar_metrics( + y_true, y_pred + ) self.y_true = y_true self.y_pred = y_pred @@ -651,7 +698,9 @@ def _compute_qq_residuals(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: df["z_res"] = df["res"].apply(lambda x: (x - m) / s) df["rank"] = df.index + 1 - df["percentile"] = df["rank"].apply(lambda x: x / (n + 1)) # divide by n+1 to avoid inf + df["percentile"] = df["rank"].apply( + lambda x: x / (n + 1) + ) # divide by n+1 to avoid inf df["q_theoretical"] = norm.ppf(df["percentile"]) return pd.Series( @@ -678,7 +727,8 @@ def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): """ if self.y_true is None and self.y_pred is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -690,7 +740,9 @@ def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): x = np.arange(1, len(y_true) + 1) - ax.plot(x, y_true, ls="--", label="actuals", color="darkorange", linewidth=3) + ax.plot( + x, y_true, ls="--", label="actuals", color="darkorange", linewidth=3 + ) ax.plot(x, y_pred, label="predictions", color="cornflowerblue", linewidth=3) ax.set_xlabel("Index", fontsize=15) @@ -728,14 +780,24 @@ def plot_qq(self, path: str = None, dim: tuple = (12, 8)): x = self.qq["quantiles"] y = self.qq["residuals"] - ax.plot(x, x, ls="--", label="perfect model", color="darkorange", linewidth=3) + ax.plot( + x, x, ls="--", label="perfect model", color="darkorange", linewidth=3 + ) ax.plot(x, y, label="current model", color="cornflowerblue", linewidth=3) ax.set_xlabel("Theoretical quantiles", fontsize=15) - ax.set_xticks(range(int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")]))) + 1, 1)) + ax.set_xticks( + range( + int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")]))) + 1, 1 + ) + ) ax.set_ylabel("Standardized residuals", fontsize=15) - ax.set_yticks(range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")]))) + 1, 1)) + ax.set_yticks( + range( + int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")]))) + 1, 1 + ) + ) ax.legend(loc="best") ax.set_title("Q-Q plot", fontsize=20) diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index dfbd85c..08e89e2 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -10,7 +10,10 @@ def generate_pig_tables( - basetable: pd.DataFrame, id_column_name: str, target_column_name: str, preprocessed_predictors: list + basetable: pd.DataFrame, + id_column_name: str, + target_column_name: str, + preprocessed_predictors: list, ) -> pd.DataFrame: """Compute PIG tables for all predictors in preprocessed_predictors. @@ -44,7 +47,10 @@ def generate_pig_tables( def compute_pig_table( - basetable: pd.DataFrame, predictor_column_name: str, target_column_name: str, id_column_name: str + basetable: pd.DataFrame, + predictor_column_name: str, + target_column_name: str, + id_column_name: str, ) -> pd.DataFrame: """Compute the PIG table of a given predictor for a given target. 
@@ -73,7 +79,13 @@ def compute_pig_table( basetable.groupby(predictor_column_name) .agg({target_column_name: "mean", id_column_name: "size"}) .reset_index() - .rename(columns={predictor_column_name: "label", target_column_name: "avg_target", id_column_name: "pop_size"}) + .rename( + columns={ + predictor_column_name: "label", + target_column_name: "avg_target", + id_column_name: "pop_size", + } + ) ) # add the column name to a variable column @@ -90,7 +102,11 @@ def compute_pig_table( def plot_incidence( - pig_tables: pd.DataFrame, variable: str, model_type: str, column_order: list = None, dim: tuple = (12, 8) + pig_tables: pd.DataFrame, + variable: str, + model_type: str, + column_order: list = None, + dim: tuple = (12, 8), ): """Plot a Predictor Insights Graph (PIG). @@ -125,14 +141,19 @@ def plot_incidence( """ if model_type not in ["classification", "regression"]: raise ValueError( - "An unexpected value was set for the model_type " "parameter. Expected 'classification' or " "'regression'." + "An unexpected value was set for the model_type " + "parameter. Expected 'classification' or " + "'regression'." ) df_plot = pig_tables[pig_tables["variable"] == variable].copy() if column_order is not None: if not set(df_plot["label"]) == set(column_order): - raise ValueError("The column_order and pig_tables parameters do not contain " "the same set of variables.") + raise ValueError( + "The column_order and pig_tables parameters do not contain " + "the same set of variables." + ) df_plot["label"] = df_plot["label"].astype("category") df_plot["label"].cat.reorder_categories(column_order, inplace=True) @@ -156,7 +177,9 @@ def plot_incidence( marker=".", markersize=20, linewidth=3, - label="incidence rate per bin" if model_type == "classification" else "mean target value per bin", + label="incidence rate per bin" + if model_type == "classification" + else "mean target value per bin", zorder=10, ) @@ -166,7 +189,9 @@ def plot_incidence( color="#022252", linestyle="--", linewidth=4, - label="average incidence rate" if model_type == "classification" else "global mean target value", + label="average incidence rate" + if model_type == "classification" + else "global mean target value", zorder=10, ) @@ -174,7 +199,10 @@ def plot_incidence( ax.plot(np.nan, "#939598", linewidth=6, label="bin size") # Set labels & ticks - ax.set_ylabel("incidence" if model_type == "classification" else "mean target value", fontsize=16) + ax.set_ylabel( + "incidence" if model_type == "classification" else "mean target value", + fontsize=16, + ) ax.set_xlabel(f"{variable} bins" "", fontsize=16) ax.xaxis.set_tick_params(labelsize=14) plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") @@ -196,9 +224,17 @@ def plot_incidence( # the bins and versus the global avg. target. # (Motivation for the AND above: if on one end there IS enough # difference, the effect that we discuss here does not occur.) - global_avg_target = max(df_plot["global_avg_target"]) # series of same number, for every bin. - if (np.abs((max(df_plot["avg_target"]) - global_avg_target)) / global_avg_target < 0.25) and ( - np.abs((min(df_plot["avg_target"]) - global_avg_target)) / global_avg_target < 0.25 + global_avg_target = max( + df_plot["global_avg_target"] + ) # series of same number, for every bin. 
+ if ( + np.abs((max(df_plot["avg_target"]) - global_avg_target)) + / global_avg_target + < 0.25 + ) and ( + np.abs((min(df_plot["avg_target"]) - global_avg_target)) + / global_avg_target + < 0.25 ): ax.set_ylim(global_avg_target * 0.75, global_avg_target * 1.25) @@ -212,7 +248,13 @@ def plot_incidence( # ----------------- ax2 = ax.twinx() - ax2.bar(df_plot["label"], df_plot["pop_size"], align="center", color="#939598", zorder=1) + ax2.bar( + df_plot["label"], + df_plot["pop_size"], + align="center", + color="#939598", + zorder=1, + ) # Set labels & ticks ax2.set_xlabel(f"{variable} bins" "", fontsize=16) diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 19fbf64..5e77192 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -12,7 +12,9 @@ DEFAULT_COLOURS = {"train": "#0099bf", "selection": "#ff9500", "validation": "#8064a2"} -def plot_univariate_predictor_quality(df_metric: pd.DataFrame, dim: tuple = (12, 8), path: str = None): +def plot_univariate_predictor_quality( + df_metric: pd.DataFrame, dim: tuple = (12, 8), path: str = None +): """Plot univariate quality of the predictors. Parameters @@ -33,7 +35,9 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, dim: tuple = (12, metric = "RMSE" ascending = True - df = df_metric[df_metric["preselection"]].sort_values(by=metric + " selection", ascending=ascending) + df = df_metric[df_metric["preselection"]].sort_values( + by=metric + " selection", ascending=ascending + ) df = pd.melt( df, @@ -62,7 +66,9 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, dim: tuple = (12, plt.show() -def plot_correlation_matrix(df_corr: pd.DataFrame, dim: tuple = (12, 8), path: str = None): +def plot_correlation_matrix( + df_corr: pd.DataFrame, dim: tuple = (12, 8), path: str = None +): """Plot correlation matrix amongst the predictors. Parameters @@ -158,7 +164,9 @@ def plot_performance_curves( # Set x- and y-ticks ax.set_xticks(np.arange(len(model_performance["last_added_predictor"]))) - ax.set_xticklabels(model_performance["last_added_predictor"].tolist(), rotation=40, ha="right") + ax.set_xticklabels( + model_performance["last_added_predictor"].tolist(), rotation=40, ha="right" + ) if model_type == "classification": ax.set_yticks(np.arange(0.5, max_metric + 0.02, 0.05)) @@ -181,7 +189,10 @@ def plot_performance_curves( def plot_variable_importance( - df_variable_importance: pd.DataFrame, title: str = None, dim: tuple = (12, 8), path: str = None + df_variable_importance: pd.DataFrame, + title: str = None, + dim: tuple = (12, 8), + path: str = None, ): """Plot variable importance of a given model. @@ -198,7 +209,12 @@ def plot_variable_importance( """ with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax = sns.barplot(x="importance", y="predictor", data=df_variable_importance, color="cornflowerblue") + ax = sns.barplot( + x="importance", + y="predictor", + data=df_variable_importance, + color="cornflowerblue", + ) if title: ax.set_title(title) else: diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index ee75b02..1733616 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -43,7 +43,12 @@ class ForwardFeatureSelection: List of fitted models. 
""" - def __init__(self, model_type: str = "classification", max_predictors: int = 50, pos_only: bool = True): + def __init__( + self, + model_type: str = "classification", + max_predictors: int = 50, + pos_only: bool = True, + ): """Initialize the ForwardFeatureSelection class.""" self.model_type = model_type if model_type == "classification": @@ -75,7 +80,10 @@ def get_model_from_step(self, step: int) -> Model: In case step is larger than the number of available models. """ if len(self._fitted_models) <= step: - raise ValueError(f"No model available for step {step}. " "The first step starts from index 0.") + raise ValueError( + f"No model available for step {step}. " + "The first step starts from index 0." + ) return self._fitted_models[step] @@ -122,7 +130,10 @@ def compute_model_performances( for model in self._fitted_models: last_added_predictor = set(model.predictors).difference(predictor_set) - tmp = {"predictors": model.predictors, "last_added_predictor": list(last_added_predictor)[0]} + tmp = { + "predictors": model.predictors, + "last_added_predictor": list(last_added_predictor)[0], + } # Evaluate model on each dataset split, # e.g. train-selection-validation @@ -179,26 +190,44 @@ def fit( In case the number of forced predictors is larger than the maximum number of allowed predictors in the model. """ - assert "split" in train_data.columns, "The train_data input df does not include a split column." assert ( - len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0 + "split" in train_data.columns + ), "The train_data input df does not include a split column." + assert ( + len( + set(["train", "selection"]).difference( + set(train_data["split"].unique()) + ) + ) + == 0 ), "The train_data input df does not include a 'train' and 'selection' split." # remove excluded predictors from predictor lists forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS excluded_predictors = excluded_predictors or DEFAULT_EXCLUDED_PREDICTORS filtered_predictors = [ - var for var in predictors if (var not in excluded_predictors and var not in forced_predictors) + var + for var in predictors + if (var not in excluded_predictors and var not in forced_predictors) ] # checks on predictor lists and self.max_predictors attr if len(forced_predictors) > self.max_predictors: - raise ValueError("Size of forced_predictors cannot be bigger than " "max_predictors.") + raise ValueError( + "Size of forced_predictors cannot be bigger than " "max_predictors." + ) elif len(forced_predictors) == self.max_predictors: - log.info("Size of forced_predictors equals max_predictors " "only one model will be trained...") + log.info( + "Size of forced_predictors equals max_predictors " + "only one model will be trained..." + ) # train model with all forced_predictors (only) self._fitted_models.append( - self._train_model(train_data[train_data["split"] == "train"], target_column_name, forced_predictors) + self._train_model( + train_data[train_data["split"] == "train"], + target_column_name, + forced_predictors, + ) ) else: @@ -207,7 +236,11 @@ def fit( ) def _forward_selection( - self, train_data: pd.DataFrame, target_column_name: str, predictors: list, forced_predictors: list = None + self, + train_data: pd.DataFrame, + target_column_name: str, + predictors: list, + forced_predictors: list = None, ) -> list[Model]: """Perform the forward feature selection algorithm. 
@@ -237,22 +270,34 @@ def _forward_selection( fitted_models: list[Model] = [] current_predictors: list[str] = [] - max_steps = 1 + min(self.max_predictors, len(predictors) + len(forced_predictors)) + max_steps = 1 + min( + self.max_predictors, len(predictors) + len(forced_predictors) + ) - for step in tqdm(range(1, max_steps), desc="Sequentially adding best predictor..."): + for step in tqdm( + range(1, max_steps), desc="Sequentially adding best predictor..." + ): if step <= len(forced_predictors): # first, we go through the forced predictors - candidate_predictors = [var for var in forced_predictors if var not in current_predictors] + candidate_predictors = [ + var for var in forced_predictors if var not in current_predictors + ] else: candidate_predictors = [ - var for var in (predictors + forced_predictors) if var not in current_predictors + var + for var in (predictors + forced_predictors) + if var not in current_predictors ] - model = self._find_next_best_model(train_data, target_column_name, candidate_predictors, current_predictors) + model = self._find_next_best_model( + train_data, target_column_name, candidate_predictors, current_predictors + ) if model is not None: # Add new model predictors to the list of current predictors - current_predictors = list(set(current_predictors).union(set(model.predictors))) + current_predictors = list( + set(current_predictors).union(set(model.predictors)) + ) fitted_models.append(model) # else: @@ -267,7 +312,11 @@ def _forward_selection( return fitted_models def _find_next_best_model( - self, train_data: pd.DataFrame, target_column_name: str, candidate_predictors: list, current_predictors: list + self, + train_data: pd.DataFrame, + target_column_name: str, + candidate_predictors: list, + current_predictors: list, ) -> Model: """ Find the next best model with candidate predictors. @@ -312,16 +361,24 @@ def _find_next_best_model( "ForwardFeatureSelection argument." ) - fit_data = train_data[train_data["split"] == "train"] # data to fit the models with - sel_data = train_data[train_data["split"] == "selection"] # data to compare the models with + fit_data = train_data[ + train_data["split"] == "train" + ] # data to fit the models with + sel_data = train_data[ + train_data["split"] == "selection" + ] # data to compare the models with for pred in candidate_predictors: # Train a model with an additional predictor - model = self._train_model(fit_data, target_column_name, (current_predictors + [pred])) + model = self._train_model( + fit_data, target_column_name, (current_predictors + [pred]) + ) # Evaluate the model performance = model.evaluate( - sel_data[current_predictors + [pred]], sel_data[target_column_name], split="selection" + sel_data[current_predictors + [pred]], + sel_data[target_column_name], + split="selection", ) if self.pos_only and (not (model.get_coef() >= 0).all()): @@ -329,16 +386,23 @@ def _find_next_best_model( # Check if the model is better than the current best model # and if it is, replace the current best. 
- if self.MLModel == LogisticRegressionModel and performance > best_performance: # AUC metric is used + if ( + self.MLModel == LogisticRegressionModel + and performance > best_performance + ): # AUC metric is used best_performance = performance best_model = model - elif self.MLModel == LinearRegressionModel and performance < best_performance: # RMSE metric is used + elif ( + self.MLModel == LinearRegressionModel and performance < best_performance + ): # RMSE metric is used best_performance = performance best_model = model return cast(Model, best_model) - def _train_model(self, train_data: pd.DataFrame, target_column_name: str, predictors: list) -> Model: + def _train_model( + self, train_data: pd.DataFrame, target_column_name: str, predictors: list + ) -> Model: """Train the model with a given set of predictors. Parameters diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 408ead4..ba8befd 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -33,7 +33,9 @@ class LogisticRegressionModel: def __init__(self): """Initialize the LogisticRegressionModel class.""" - self.logit = LogisticRegression(fit_intercept=True, C=1e9, solver="liblinear", random_state=42) + self.logit = LogisticRegression( + fit_intercept=True, C=1e9, solver="liblinear", random_state=42 + ) self._is_fitted = False # placeholder to keep track of a list of predictors self.predictors = [] @@ -152,7 +154,13 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: # ensure we have the proper predictors and the proper order return self.logit.predict_proba(X[self.predictors])[:, 1] - def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str = None, metric: Optional[Callable] = None) -> float: + def evaluate( + self, + X: pd.DataFrame, + y: pd.Series, + split: str = None, + metric: Optional[Callable] = None, + ) -> float: """ Evaluate the model on a given dataset (X, y). 
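
A note on the two metrics used throughout these model classes: logistic models are compared on AUC, where higher is better, and linear models on RMSE, where lower is better, which is why the comparison in `_find_next_best_model` flips direction per model type. Both metrics in isolation, using scikit-learn (already imported elsewhere in these files):

    import numpy as np
    from sklearn.metrics import mean_squared_error, roc_auc_score

    y_true = np.array([0, 1, 1, 0, 1])
    y_score = np.array([0.2, 0.8, 0.6, 0.4, 0.9])

    auc = roc_auc_score(y_true, y_score)                 # maximised during selection
    rmse = np.sqrt(mean_squared_error(y_true, y_score))  # minimised during selection
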
@@ -185,7 +193,9 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str = None, metric: Opt y_pred = self.score_model(X) fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_pred) - cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) + cutoff = ClassificationEvaluator._compute_optimal_cutoff( + fpr, tpr, thresholds + ) y_pred_b = np.array([0 if pred <= cutoff else 1 for pred in y_pred]) performance = metric(y_true=y, y_pred=y_pred_b) @@ -219,11 +229,15 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: y_pred = self.score_model(data) importance_by_variable = { - utils.clean_predictor_name(predictor): stats.pearsonr(data[predictor], y_pred)[0] + utils.clean_predictor_name(predictor): stats.pearsonr( + data[predictor], y_pred + )[0] for predictor in self.predictors } - df = pd.DataFrame.from_dict(importance_by_variable, orient="index").reset_index() + df = pd.DataFrame.from_dict( + importance_by_variable, orient="index" + ).reset_index() df.columns = ["predictor", "importance"] return df.sort_values(by="importance", ascending=False).reset_index(drop=True) @@ -285,7 +299,10 @@ def serialize(self) -> dict: if self._is_fitted: serialized_model.update( - {"coef_": self.linear.coef_.tolist(), "intercept_": self.linear.intercept_.tolist()} + { + "coef_": self.linear.coef_.tolist(), + "intercept_": self.linear.intercept_.tolist(), + } ) return serialized_model @@ -374,7 +391,13 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: # ensure we have the proper predictors and the proper order return self.linear.predict(X[self.predictors]) - def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str = None, metric: Optional[Callable] = None) -> float: + def evaluate( + self, + X: pd.DataFrame, + y: pd.Series, + split: str = None, + metric: Optional[Callable] = None, + ) -> float: """Evaluate the model on a given dataset (X, y). 
The optional split @@ -434,11 +457,15 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: y_pred = self.score_model(data) importance_by_variable = { - utils.clean_predictor_name(predictor): stats.pearsonr(data[predictor], y_pred)[0] + utils.clean_predictor_name(predictor): stats.pearsonr( + data[predictor], y_pred + )[0] for predictor in self.predictors } - df = pd.DataFrame.from_dict(importance_by_variable, orient="index").reset_index() + df = pd.DataFrame.from_dict( + importance_by_variable, orient="index" + ).reset_index() df.columns = ["predictor", "importance"] return df.sort_values(by="importance", ascending=False).reset_index(drop=True) diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 2d90b48..d6c1901 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -74,14 +74,22 @@ def compute_univariate_preselection( cleaned_predictor = utils.clean_predictor_name(predictor) auc_train = roc_auc_score( - y_true=target_enc_train_data[target_column], y_score=target_enc_train_data[predictor] + y_true=target_enc_train_data[target_column], + y_score=target_enc_train_data[predictor], ) auc_selection = roc_auc_score( - y_true=target_enc_selection_data[target_column], y_score=target_enc_selection_data[predictor] + y_true=target_enc_selection_data[target_column], + y_score=target_enc_selection_data[predictor], ) - result.append({"predictor": cleaned_predictor, "AUC train": auc_train, "AUC selection": auc_selection}) + result.append( + { + "predictor": cleaned_predictor, + "AUC train": auc_train, + "AUC selection": auc_selection, + } + ) df_auc = pd.DataFrame(result) @@ -95,23 +103,35 @@ def compute_univariate_preselection( df_auc["preselection"] = auc_thresh & auc_overtrain - df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index(drop=True) + df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index( + drop=True + ) elif model_type == "regression": for predictor in predictors: cleaned_predictor = utils.clean_predictor_name(predictor) rmse_train = sqrt( - mean_squared_error(y_true=target_enc_train_data[target_column], y_pred=target_enc_train_data[predictor]) + mean_squared_error( + y_true=target_enc_train_data[target_column], + y_pred=target_enc_train_data[predictor], + ) ) rmse_selection = sqrt( mean_squared_error( - y_true=target_enc_selection_data[target_column], y_pred=target_enc_selection_data[predictor] + y_true=target_enc_selection_data[target_column], + y_pred=target_enc_selection_data[predictor], ) ) - result.append({"predictor": cleaned_predictor, "RMSE train": rmse_train, "RMSE selection": rmse_selection}) + result.append( + { + "predictor": cleaned_predictor, + "RMSE train": rmse_train, + "RMSE selection": rmse_selection, + } + ) df_rmse = pd.DataFrame(result) @@ -120,12 +140,16 @@ def compute_univariate_preselection( # Identify those variables for which the RMSE difference between train # and selection is within a user-defined ratio - preselect_overtrain = df_rmse["RMSE selection"] - df_rmse["RMSE train"] # flip subtraction vs. AUC + preselect_overtrain = ( + df_rmse["RMSE selection"] - df_rmse["RMSE train"] + ) # flip subtraction vs. 
AUC rmse_overtrain = preselect_overtrain < preselect_overtrain_threshold df_rmse["preselection"] = rmse_thresh & rmse_overtrain - df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index(drop=True) # lower is better + df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index( + drop=True + ) # lower is better return df_out @@ -147,7 +171,9 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: """ if "AUC selection" in df_metric.columns: predictor_list = ( - df_metric[df_metric["preselection"]].sort_values(by="AUC selection", ascending=False).predictor.tolist() + df_metric[df_metric["preselection"]] + .sort_values(by="AUC selection", ascending=False) + .predictor.tolist() ) elif "RMSE selection" in df_metric.columns: predictor_list = ( @@ -159,7 +185,9 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: return [col + "_enc" for col in predictor_list] -def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) -> pd.DataFrame: +def compute_correlations( + target_enc_train_data: pd.DataFrame, predictors: list +) -> pd.DataFrame: """Compute the correlations amongst the predictors in the DataFrame. Parameters @@ -177,7 +205,9 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) """ correlations = target_enc_train_data[predictors].corr() - predictors_cleaned = [utils.clean_predictor_name(predictor) for predictor in predictors] + predictors_cleaned = [ + utils.clean_predictor_name(predictor) for predictor in predictors + ] # Change index and columns with the cleaned version of the predictors # e.g. change "var1_enc" with "var1" diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py index 55e036b..cd8579a 100644 --- a/cobra/preprocessing/__init__.py +++ b/cobra/preprocessing/__init__.py @@ -5,4 +5,9 @@ from .categorical_data_processor import CategoricalDataProcessor from .preprocessor import PreProcessor -__all__ = ["KBinsDiscretizer", "TargetEncoder", "CategoricalDataProcessor", "PreProcessor"] +__all__ = [ + "KBinsDiscretizer", + "TargetEncoder", + "CategoricalDataProcessor", + "PreProcessor", +] diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 6632720..ba762ed 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -114,7 +114,8 @@ def attributes_to_dict(self) -> dict: params = self.get_params() params["_cleaned_categories_by_column"] = { - key: list(value) for key, value in self._cleaned_categories_by_column.items() + key: list(value) + for key, value in self._cleaned_categories_by_column.items() } return params @@ -148,7 +149,9 @@ def set_attributes_from_dict(self, params: dict): # of the following method from BaseEstimator: self.set_params(**params) - self._cleaned_categories_by_column = {key: set(value) for key, value in _fitted_output.items()} + self._cleaned_categories_by_column = { + key: set(value) for key, value in _fitted_output.items() + } return self @@ -172,7 +175,10 @@ def fit(self, data: pd.DataFrame, column_names: list, target_column: str): for column_name in tqdm(column_names, desc="Fitting category regrouping..."): if column_name not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column_name)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting".format(column_name) + ) continue cleaned_cats = 
self._fit_column(data, column_name, target_column) @@ -207,7 +213,10 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> se model_type = self.model_type if len(data[column_name].unique()) == 1: - log.warning(f"Predictor {column_name} is constant" " and will be ignored in computation.") + log.warning( + f"Predictor {column_name} is constant" + " and will be ignored in computation." + ) return set(data[column_name].unique()) y = data[target_column] @@ -225,19 +234,25 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> se # do not merge categories in case of dummies, i.e. 0 and 1 # (and possibly "Missing") - if len(unique_categories) == 2 or (len(unique_categories) == 3 and "Missing" in unique_categories): + if len(unique_categories) == 2 or ( + len(unique_categories) == 3 and "Missing" in unique_categories + ): return set(unique_categories) # get small categories and add them to the merged category list # does not apply incidence factor when model_type = "regression" - small_categories = CategoricalDataProcessor._get_small_categories(X, incidence, self.category_size_threshold) + small_categories = CategoricalDataProcessor._get_small_categories( + X, incidence, self.category_size_threshold + ) combined_categories = combined_categories.union(small_categories) for category in unique_categories: if category in small_categories: continue - pval = CategoricalDataProcessor._compute_p_value(X, y, category, model_type, self.scale_contingency_table) + pval = CategoricalDataProcessor._compute_p_value( + X, y, category, model_type, self.scale_contingency_table + ) # if not significant, add it to the list if pval > self.p_value_threshold: @@ -266,7 +281,10 @@ def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: Data with additional transformed variables. """ if self.regroup and len(self._cleaned_categories_by_column) == 0: - msg = "{} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + msg = ( + "{} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
+ ) raise NotFittedError(msg.format(self.__class__.__name__)) for column_name in column_names: @@ -298,7 +316,9 @@ def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFram data.loc[:, column_name_clean] = data[column_name].astype(object) # Fill missings first - data.loc[:, column_name_clean] = CategoricalDataProcessor._replace_missings(data, column_name_clean) + data.loc[:, column_name_clean] = CategoricalDataProcessor._replace_missings( + data, column_name_clean + ) if self.regroup: categories = self._cleaned_categories_by_column.get(column_name) @@ -307,10 +327,15 @@ def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFram # Log warning if categories is None, which indicates it is # not in fitted output if categories is None: - log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column_name)) + log.warning( + "Column '{}' is not in fitted output " + "and will be skipped".format(column_name) + ) return data - data.loc[:, column_name_clean] = CategoricalDataProcessor._replace_categories( + data.loc[ + :, column_name_clean + ] = CategoricalDataProcessor._replace_categories( data[column_name_clean], categories, self.regroup_name ) @@ -319,7 +344,9 @@ def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFram return data - def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: str) -> pd.DataFrame: + def fit_transform( + self, data: pd.DataFrame, column_names: list, target_column: str + ) -> pd.DataFrame: """Fit and transform the data. Parameters @@ -341,7 +368,9 @@ def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: s return self.transform(data, column_names) @staticmethod - def _get_small_categories(predictor_series: pd.Series, incidence: float, category_size_threshold: int) -> set: + def _get_small_categories( + predictor_series: pd.Series, incidence: float, category_size_threshold: int + ) -> set: """ Fetch categories with a size below a certain threshold. @@ -372,7 +401,9 @@ def _get_small_categories(predictor_series: pd.Series, incidence: float, categor return set(category_counts[bool_mask].index.tolist()) @staticmethod - def _replace_missings(data: pd.DataFrame, column_names: Optional[Union[list[str], str]] = None) -> pd.DataFrame: + def _replace_missings( + data: pd.DataFrame, column_names: Optional[Union[list[str], str]] = None + ) -> pd.DataFrame: """Replace missing values (incl. empty strings). Parameters @@ -403,7 +434,11 @@ def _replace_missings(data: pd.DataFrame, column_names: Optional[Union[list[str] @staticmethod def _compute_p_value( - X: pd.Series, y: pd.Series, category: str, model_type: str, scale_contingency_table: bool + X: pd.Series, + y: pd.Series, + category: str, + model_type: str, + scale_contingency_table: bool, ) -> float: """ Calculate p-value. 
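
Background for the hunk below: `_compute_p_value` tests whether one category's target distribution differs from all other categories combined. For classification this is a chi-squared test on a 2x2 contingency table; for regression, a Kruskal-Wallis test on the two groups' target values. A toy version of the classification branch, built from the same pandas/scipy calls (the data here is illustrative only):

    import pandas as pd
    from scipy import stats

    X = pd.Series(["A", "A", "B", "C", "B", "A"])  # candidate category: "A"
    y = pd.Series([1, 1, 0, 0, 1, 1])              # binary target

    other_categories = (X != "A").astype(int)      # 0 = "A", 1 = everything else
    contingency_table = pd.crosstab(index=other_categories, columns=y)
    pval = stats.chi2_contingency(contingency_table, correction=False)[1]
    # a large p-value means "A" is indistinguishable from the rest,
    # so it would be merged into the regrouped ("Other") category
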
@@ -439,7 +474,9 @@ def _compute_p_value( df["other_categories"] = np.where(X == category, 0, 1) if model_type == "classification": - contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"], margins=False) + contingency_table = pd.crosstab( + index=df["other_categories"], columns=df["y"], margins=False + ) # if true, we scale the "other" categories if scale_contingency_table: @@ -453,12 +490,16 @@ def _compute_p_value( pval = stats.chi2_contingency(contingency_table, correction=False)[1] elif model_type == "regression": - pval = stats.kruskal(df.y[df.other_categories == 0], df.y[df.other_categories == 1])[1] + pval = stats.kruskal( + df.y[df.other_categories == 0], df.y[df.other_categories == 1] + )[1] return pval @staticmethod - def _replace_categories(data: pd.Series, categories: set, replace_with: str) -> pd.Series: + def _replace_categories( + data: pd.Series, categories: set, replace_with: str + ) -> pd.Series: """ Replace categories in set with "Other". diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 0ad6265..84fae51 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -137,7 +137,8 @@ def attributes_to_dict(self) -> dict: params = self.get_params() params["_bins_by_column"] = { - key: [list(tup) for tup in value] if value else None for key, value in self._bins_by_column.items() + key: [list(tup) for tup in value] if value else None + for key, value in self._bins_by_column.items() } return params @@ -160,7 +161,9 @@ def set_attributes_from_dict(self, params: dict): if type(_bins_by_column) != dict: raise ValueError( - "_bins_by_column is expected to be a dict but is of type {} instead".format(type(_bins_by_column)) + "_bins_by_column is expected to be a dict but is of type {} instead".format( + type(_bins_by_column) + ) ) # Clean out params dictionary to remove unknown keys (for safety!) @@ -171,7 +174,8 @@ def set_attributes_from_dict(self, params: dict): self.set_params(**params) self._bins_by_column = { - key: ([tuple(v) for v in value] if value else None) for key, value in _bins_by_column.items() + key: ([tuple(v) for v in value] if value else None) + for key, value in _bins_by_column.items() } return self @@ -195,7 +199,10 @@ def fit(self, data: pd.DataFrame, column_names: list): for column_name in tqdm(column_names, desc="Computing discretization bins..."): if column_name not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column_name)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting".format(column_name) + ) continue bins = self._fit_column(data, column_name) @@ -203,7 +210,9 @@ def fit(self, data: pd.DataFrame, column_names: list): # Add to bins_by_column for later use self._bins_by_column[column_name] = bins - def _fit_column(self, data: pd.DataFrame, column_name: str) -> Optional[List[tuple]]: + def _fit_column( + self, data: pd.DataFrame, column_name: str + ) -> Optional[List[tuple]]: """Compute bins for a specific column in data. 
Parameters @@ -221,7 +230,11 @@ def _fit_column(self, data: pd.DataFrame, column_name: str) -> Optional[List[tup col_min, col_max = data[column_name].min(), data[column_name].max() if col_min == col_max: - log.warning("Predictor '{}' is constant and will be ignored in computation".format(column_name)) + log.warning( + "Predictor '{}' is constant and will be ignored in computation".format( + column_name + ) + ) return None prop_inf = np.sum(np.isinf(data[column_name])) / data[column_name].shape[0] @@ -237,7 +250,11 @@ def _fit_column(self, data: pd.DataFrame, column_name: str) -> Optional[List[tup prop_nan = data[column_name].isna().sum() / data[column_name].shape[0] if prop_nan >= 0.99: - log.warning(f"Column {column_name} is" f" {prop_nan:.1%}% NaNs, " f"consider dropping or transforming it.") + log.warning( + f"Column {column_name} is" + f" {prop_nan:.1%}% NaNs, " + f"consider dropping or transforming it." + ) n_bins = self.n_bins if self.auto_adapt_bins: @@ -249,7 +266,8 @@ def _fit_column(self, data: pd.DataFrame, column_name: str) -> Optional[List[tup if len(bin_edges) < 3: log.warning( - "Only 1 bin was found for predictor '{}' so it will " "be ignored in computation".format(column_name) + "Only 1 bin was found for predictor '{}' so it will " + "be ignored in computation".format(column_name) ) return None @@ -281,12 +299,18 @@ def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: data with additional discretized variables """ if len(self._bins_by_column) == 0: - msg = "{} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + msg = ( + "{} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) for column_name in tqdm(column_names, desc="Discretizing columns..."): if column_name not in self._bins_by_column: - log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column_name)) + log.warning( + "Column '{}' is not in fitted output " + "and will be skipped".format(column_name) + ) continue # can be None for a column with a constant value! @@ -296,7 +320,9 @@ def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: return data - def _transform_column(self, data: pd.DataFrame, column_name: str, bins: List[tuple]) -> pd.DataFrame: + def _transform_column( + self, data: pd.DataFrame, column_name: str, bins: List[tuple] + ) -> pd.DataFrame: """Create a new column with binned values of column_name. Parameters @@ -323,7 +349,9 @@ def _transform_column(self, data: pd.DataFrame, column_name: str, bins: List[tup # Rename bins so that the output has a proper format bin_labels = self._create_bin_labels(bins) - data.loc[:, column_name_bin] = data[column_name_bin].cat.rename_categories(bin_labels) + data.loc[:, column_name_bin] = data[column_name_bin].cat.rename_categories( + bin_labels + ) if data[column_name_bin].isnull().sum() > 0: @@ -355,7 +383,12 @@ def fit_transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: return self.transform(data, column_names) def _compute_bin_edges( - self, data: pd.DataFrame, column_name: str, n_bins: int, col_min: float, col_max: float + self, + data: pd.DataFrame, + column_name: str, + n_bins: int, + col_min: float, + col_max: float, ) -> list: """Compute the desired bin edges. 
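
For orientation: `_compute_bin_edges` turns a numeric column into `n_bins` interval edges according to the configured strategy. The quantile flavour of that idea is what `pandas.qcut` gives in one call, so a rough stand-in (not cobra's implementation, which additionally handles infinities, NaNs, too-few distinct edges and label formatting) looks like this:

    import pandas as pd

    values = pd.Series([1, 2, 2, 3, 5, 8, 13, 21])

    binned = pd.qcut(values, q=4)           # 4 bins with roughly equal populations
    print(binned.value_counts(sort=False))  # here: 2 observations per bin
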
@@ -475,7 +508,9 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: return bins @staticmethod - def _create_index(intervals: List[tuple], closed: str = "right") -> pd.IntervalIndex: + def _create_index( + intervals: List[tuple], closed: str = "right" + ) -> pd.IntervalIndex: """ Create an pd.IntervalIndex based on a list of tuples. diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 3a82efa..c873b68 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -169,7 +169,13 @@ def from_params( ) discretizer = KBinsDiscretizer( - n_bins, strategy, closed, auto_adapt_bins, starting_precision, label_format, change_endpoint_format + n_bins, + strategy, + closed, + auto_adapt_bins, + starting_precision, + label_format, + change_endpoint_format, ) target_encoder = TargetEncoder(weight, imputation_strategy) @@ -201,10 +207,15 @@ def from_pipeline(cls, pipeline: dict): and no others. """ if not PreProcessor._is_valid_pipeline(pipeline): - raise ValueError("Invalid pipeline, as it does not " "contain all and only the required parameters.") + raise ValueError( + "Invalid pipeline, as it does not " + "contain all and only the required parameters." + ) categorical_data_processor = CategoricalDataProcessor() - categorical_data_processor.set_attributes_from_dict(pipeline["categorical_data_processor"]) + categorical_data_processor.set_attributes_from_dict( + pipeline["categorical_data_processor"] + ) # model_type = categorical_data_processor.model_type discretizer = KBinsDiscretizer() @@ -213,9 +224,20 @@ def from_pipeline(cls, pipeline: dict): target_encoder = TargetEncoder() target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - return cls(categorical_data_processor, discretizer, target_encoder, is_fitted=pipeline["_is_fitted"]) + return cls( + categorical_data_processor, + discretizer, + target_encoder, + is_fitted=pipeline["_is_fitted"], + ) - def fit(self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: list, target_column_name: str): + def fit( + self, + train_data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list, + target_column_name: str, + ): """Fit the data to the preprocessing pipeline. Parameters @@ -230,7 +252,9 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: li Column name of the target. 
""" # get list of all variables - preprocessed_variable_names = PreProcessor._get_variable_list(continuous_vars, discrete_vars) + preprocessed_variable_names = PreProcessor._get_variable_list( + continuous_vars, discrete_vars + ) log.info("Starting to fit pipeline") start = time.time() @@ -244,25 +268,39 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: li if continuous_vars: begin = time.time() self._discretizer.fit(train_data, continuous_vars) - log.info("Fitting KBinsDiscretizer took {} seconds".format(time.time() - begin)) + log.info( + "Fitting KBinsDiscretizer took {} seconds".format(time.time() - begin) + ) train_data = self._discretizer.transform(train_data, continuous_vars) if discrete_vars: begin = time.time() - self._categorical_data_processor.fit(train_data, discrete_vars, target_column_name) - log.info("Fitting categorical_data_processor class took {} seconds".format(time.time() - begin)) + self._categorical_data_processor.fit( + train_data, discrete_vars, target_column_name + ) + log.info( + "Fitting categorical_data_processor class took {} seconds".format( + time.time() - begin + ) + ) - train_data = self._categorical_data_processor.transform(train_data, discrete_vars) + train_data = self._categorical_data_processor.transform( + train_data, discrete_vars + ) begin = time.time() - self._target_encoder.fit(train_data, preprocessed_variable_names, target_column_name) + self._target_encoder.fit( + train_data, preprocessed_variable_names, target_column_name + ) log.info("Fitting TargetEncoder took {} seconds".format(time.time() - begin)) self._is_fitted = True # set fitted boolean to True log.info("Fitting pipeline took {} seconds".format(time.time() - start)) - def transform(self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list) -> pd.DataFrame: + def transform( + self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list + ) -> pd.DataFrame: """Transform the data by applying the preprocessing pipeline. Parameters @@ -288,11 +326,14 @@ def transform(self, data: pd.DataFrame, continuous_vars: list, discrete_vars: li if not self._is_fitted: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) - preprocessed_variable_names = PreProcessor._get_variable_list(continuous_vars, discrete_vars) + preprocessed_variable_names = PreProcessor._get_variable_list( + continuous_vars, discrete_vars + ) if continuous_vars: data = self._discretizer.transform(data, continuous_vars) @@ -307,7 +348,11 @@ def transform(self, data: pd.DataFrame, continuous_vars: list, discrete_vars: li return data def fit_transform( - self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: list, target_column_name: str + self, + train_data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list, + target_column_name: str, ) -> pd.DataFrame: """Fit preprocessing pipeline and transform the data. @@ -333,7 +378,10 @@ def fit_transform( @staticmethod def train_selection_validation_split( - data: pd.DataFrame, train_prop: float = 0.6, selection_prop: float = 0.2, validation_prop: float = 0.2 + data: pd.DataFrame, + train_prop: float = 0.6, + selection_prop: float = 0.2, + validation_prop: float = 0.2, ) -> pd.DataFrame: """Add `split` column with train/selection/validation values to the dataset. 
@@ -358,7 +406,10 @@ def train_selection_validation_split( DataFrame with additional split column. """ if not math.isclose(train_prop + selection_prop + validation_prop, 1.0): - raise ValueError("The sum of train_prop, selection_prop and " "validation_prop must be 1.0.") + raise ValueError( + "The sum of train_prop, selection_prop and " + "validation_prop must be 1.0." + ) if train_prop == 0.0: raise ValueError("train_prop cannot be zero!") @@ -373,7 +424,10 @@ def train_selection_validation_split( correction = nrows - (size_train + size_select + size_valid) split = ( - ["train"] * size_train + ["train"] * correction + ["selection"] * size_select + ["validation"] * size_valid + ["train"] * size_train + + ["train"] * correction + + ["selection"] * size_select + + ["validation"] * size_valid ) shuffle(split) @@ -395,9 +449,13 @@ def serialize_pipeline(self) -> dict[str, Any]: Return the pipeline as a dictionary. """ pipeline: dict[str, Any] - pipeline = {"metadata": {"timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S")}} + pipeline = { + "metadata": {"timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S")} + } - pipeline["categorical_data_processor"] = self._categorical_data_processor.attributes_to_dict() + pipeline[ + "categorical_data_processor" + ] = self._categorical_data_processor.attributes_to_dict() pipeline["discretizer"] = self._discretizer.attributes_to_dict() pipeline["target_encoder"] = self._target_encoder.attributes_to_dict() @@ -416,7 +474,9 @@ def _is_valid_pipeline(pipeline: dict) -> bool: Loaded pipeline from JSON file. """ keys = inspect.getfullargspec(PreProcessor.from_params).args - valid_keys = set([key for key in keys if key not in ["cls", "serialization_path"]]) + valid_keys = set( + [key for key in keys if key not in ["cls", "serialization_path"]] + ) input_keys: Set[str] = set() for key in pipeline: @@ -453,7 +513,9 @@ def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: ValueError In case both lists are empty. """ - var_list = [col + "_processed" for col in discrete_vars] + [col + "_bin" for col in continuous_vars] + var_list = [col + "_processed" for col in discrete_vars] + [ + col + "_bin" for col in continuous_vars + ] if not var_list: raise ValueError("Variable var_list is None or empty list.") diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index cd6bc34..7cd3f6a 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -75,7 +75,9 @@ def __init__(self, weight: float = 0.0, imputation_strategy: str = "mean"): elif imputation_strategy not in self.valid_imputation_strategies: raise ValueError( "Valid options for 'imputation_strategy' are {}. 
" - "Got imputation_strategy={!r} instead.".format(self.valid_imputation_strategies, imputation_strategy) + "Got imputation_strategy={!r} instead.".format( + self.valid_imputation_strategies, imputation_strategy + ) ) if weight == 0: @@ -104,7 +106,9 @@ def attributes_to_dict(self) -> dict: """ params = self.get_params() - params["_mapping"] = {key: value.to_dict() for key, value in self._mapping.items()} + params["_mapping"] = { + key: value.to_dict() for key, value in self._mapping.items() + } params["_global_mean"] = self._global_mean @@ -122,7 +126,10 @@ def set_attributes_from_dict(self, params: dict): if "weight" in params and type(params["weight"]) == float: self.weight = params["weight"] - if "imputation_strategy" in params and params["imputation_strategy"] in self.valid_imputation_strategies: + if ( + "imputation_strategy" in params + and params["imputation_strategy"] in self.valid_imputation_strategies + ): self.imputation_strategy = params["imputation_strategy"] if "_global_mean" in params and type(params["_global_mean"]) == float: @@ -137,7 +144,9 @@ def dict_to_series(key, value): s.index.name = key return s - self._mapping = {key: dict_to_series(key, value) for key, value in _mapping.items()} + self._mapping = { + key: dict_to_series(key, value) for key, value in _mapping.items() + } return self @@ -160,7 +169,10 @@ def fit(self, data: pd.DataFrame, column_names: list, target_column: str): for column in tqdm(column_names, desc="Fitting target encoding..."): if column not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting".format(column) + ) continue self._mapping[column] = self._fit_column(data[column], y) @@ -220,7 +232,8 @@ def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: """ if (len(self._mapping) == 0) or (self._global_mean is None): msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -229,7 +242,10 @@ def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: log.warning("Unknown column '{}' will be skipped.".format(column)) continue elif column not in self._mapping: - log.warning("Column '{}' is not in fitted output " "and will be skipped.".format(column)) + log.warning( + "Column '{}' is not in fitted output " + "and will be skipped.".format(column) + ) continue data = self._transform_column(data, column) @@ -258,7 +274,9 @@ def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFram # Convert dtype to float, because when the original dtype # is of type "category", the resulting dtype would otherwise also be of # type "category": - data[new_column] = data[column_name].map(self._mapping[column_name]).astype("float") + data[new_column] = ( + data[column_name].map(self._mapping[column_name]).astype("float") + ) # In case of categorical data, it could be that new categories will # emerge which were not present in the train set, so this will result @@ -274,7 +292,9 @@ def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFram return data - def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: str) -> pd.DataFrame: + def fit_transform( + self, data: pd.DataFrame, column_names: list, target_column: str + ) -> pd.DataFrame: """Fit the encoder and transform the data. Parameters diff --git a/cobra/utils.py b/cobra/utils.py index 0287947..4efee0d 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -8,4 +8,6 @@ def clean_predictor_name(predictor_name: str) -> str: This is done by stripping the redundant suffix (e.g. "_enc" or "_bin") off from the end of the predictor name to return a clean version of the predictor """ - return predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "") + return ( + predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "") + ) From ae2296d06a718329b4bbfe89d332d69ca702ad6f Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Fri, 24 Jun 2022 15:52:43 +0200 Subject: [PATCH 7/9] chore: remove line length of 120, default is 88 --- Makefile | 2 +- cobra/evaluation/evaluator.py | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index c0b115b..3b20397 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ test-unit: @echo 'unit tests OK' black-check: - black --diff --line-length 80 cobra/ + black --diff cobra/ black: black cobra/ diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 278bf9d..985cd63 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -787,16 +787,12 @@ def plot_qq(self, path: str = None, dim: tuple = (12, 8)): ax.set_xlabel("Theoretical quantiles", fontsize=15) ax.set_xticks( - range( - int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")]))) + 1, 1 - ) + range(int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")]))) + 1) ) ax.set_ylabel("Standardized residuals", fontsize=15) ax.set_yticks( - range( - int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")]))) + 1, 1 - ) + range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")]))) + 1) ) ax.legend(loc="best") From 816a0449d81b51a9c9ed870a969ddf4d8f951840 Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Tue, 12 Jul 2022 14:35:36 +0200 Subject: [PATCH 8/9] chore: Python version 3.9 to work with typings --- .github/workflows/development_CI.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/.github/workflows/development_CI.yaml b/.github/workflows/development_CI.yaml index e0f18f8..100c71e 100644 --- a/.github/workflows/development_CI.yaml +++ b/.github/workflows/development_CI.yaml @@ -16,10 +16,10 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: 3.9 - name: Install dependencies run: | From 9b60ffe3fd2fa8d01eeaf16589411fb75996881b Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Tue, 12 Jul 2022 14:36:26 +0200 Subject: [PATCH 9/9] chore: add pytest-cov --- requirements.dev.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.dev.txt b/requirements.dev.txt index 9534dc0..dc2121b 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -3,4 +3,5 @@ mypy>=0.942 pycodestyle>=2.8.0 pydocstyle>=6.1.1 pytest>=7.1.1 -pytest-mock>=3.7.0 \ No newline at end of file +pytest-mock>=3.7.0 +pytest-cov>=3.0.0 \ No newline at end of file
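
With pytest-cov in the dev requirements, a coverage run is typically invoked as `pytest --cov=cobra tests`. A matching Makefile target (hypothetical, not part of this patch, but in the style of the existing ones) could look like:

    test-coverage:
    	pytest --cov=cobra tests
    	@echo 'coverage OK'
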