New feature: add information criteria for model diagnostics (#519)
* add the aic & bic computations to the GeneralizedLinearRegressor and GeneralizedLinearRegressorBase classes

* update doc string and neaten _compute_information_criteria method

* add the reference link

* fix the df warning when using ridge regression

* update CHANGELOG to include this change, extend the test to look for warnings and remove unnecessary warning

* put date in CHANGELOG

* slight update to doc string on _set_up_for_info_criteria

* Update CHANGELOG.rst

Co-authored-by: Luca Bittarello <15511539+lbittarello@users.noreply.github.com>

* use the TweedieDistribution in checking the accepted family for aic/bic computation and remove the set_up / tear_down code

* fix rendering of the doc string for readthedocs

* get the number of observations in a more robust way

Co-authored-by: Marc-Antoine Schmidt <marc-antoine.schmidt@quantco.com>

* Fix spelling of 'criterion' to plural 'criteria' in docstring

Co-authored-by: Luca Bittarello <15511539+lbittarello@users.noreply.github.com>

* fix typo in docstring

Co-authored-by: Luca Bittarello <15511539+lbittarello@users.noreply.github.com>

* fix incorrect space in doc string

* fix CHANGELOG.rst to document all changes that are being tracked in current release

* calling aic, bic or aicc before the model has been fit should raise an exception

* remove the set up and tear down for info criteria methods as these are unnecessarily complicated

* oops. definitely not removing this one

* fix silly bug where _info_criteria dictionary was not initialised

* restructure the information criteria api so that we pass the data in as arguments to the method

* update the doc strings to reflect how degrees of freedom are computed and put warning back in for L2 penalty

* we only change the GeneralizedLinearRegressor

* put deleted line break back in

* add parameters to doc string and make tests more readable

* move _num_obs to a more sensible place in _set_up_for_fit

* fix the warnings so that they are raised correctly and don't print to the terminal when running the tests

* update to make get_info_criteria private, throw error on None aicc and note that test values are from statsmodels

* change the error message slightly to be clearer

* explicitly use statsmodels calculations for aic, bic and aicc in test

Co-authored-by: Nick Hoernle <nicholas.hoernle@gmail.com>
Co-authored-by: Luca Bittarello <15511539+lbittarello@users.noreply.github.com>
Co-authored-by: Marc-Antoine Schmidt <marc-antoine.schmidt@quantco.com>
4 people committed Mar 7, 2022
1 parent 55d3f08 commit a5c8f15
Showing 3 changed files with 240 additions and 1 deletion.
8 changes: 7 additions & 1 deletion CHANGELOG.rst
@@ -7,7 +7,7 @@
Changelog
=========

2.0.4 - 202X-XX-XX
Unreleased
------------------

**Bug fix:**
@@ -19,6 +19,12 @@ Changelog
- The CI now runs daily unit tests against the nightly builds of numpy, pandas and scikit-learn.
- ``oldest-supported-numpy`` is used for builds.

**New feature:**

- Added :meth:`aic`, :meth:`aicc` and :meth:`bic` methods to :class:`~glum.GeneralizedLinearRegressor`.
  These methods report the information criteria based on the training data and the effective degrees of freedom
  of the maximum likelihood estimate of the model's parameters.
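
  A minimal, hypothetical usage sketch (``X_train`` and ``y_train`` are
  placeholder names for the training data; the calls follow the new API added
  in this commit)::

      from glum import GeneralizedLinearRegressor

      model = GeneralizedLinearRegressor(family="normal", l1_ratio=1.0, alpha=1.0)
      model.fit(X_train, y_train)

      # the criteria must be evaluated on the same data used for fitting
      print(model.aic(X_train, y_train))
      print(model.aicc(X_train, y_train))
      print(model.bic(X_train, y_train))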

2.0.3 - 2021-11-05
------------------

178 changes: 178 additions & 0 deletions src/glum/_glm.py
@@ -790,6 +790,10 @@ def _set_up_for_fit(self, y: np.ndarray) -> None:
        # substantially change estimates
        self._center_predictors: bool = self.fit_intercept

        # require number of observations in the training data for later
        # computation of information criteria
        self._num_obs: int = y.shape[0]

        if self.solver == "auto":
            if (self.A_ineq is not None) and (self.b_ineq is not None):
                self._solver = "trust-constr"
@@ -2474,3 +2478,177 @@ def fit(
        self._tear_down_from_fit()

        return self

    def _compute_information_criteria(
        self,
        X: ShapedArrayLike,
        y: ShapedArrayLike,
        sample_weight: Optional[ArrayLike] = None,
    ):
        """
        Compute and store the model's degrees of freedom and the 'aic',
        'aicc' and 'bic' information criteria.

        The model's degrees of freedom are used to calculate the effective
        number of parameters. This uses the claim by [2] and [3] that, for
        L1 regularisation, the number of non-zero parameters in the trained
        model is an unbiased estimator of the model's degrees of freedom.
        Note that this might not hold true for L2 regularisation, so we
        raise a warning in that case.

        References
        ----------
        [1] Burnham KP, Anderson DR (2002). Model Selection and Multimodel
        Inference; Springer New York.
        [2] Zou, H., Hastie, T. and Tibshirani, R., (2007). On the "degrees of
        freedom" of the lasso; The Annals of Statistics.
        [3] Park, M.Y., 2006. Generalized linear models with regularization;
        Stanford University.
        """
        # we require that the log_likelihood be defined
        model_err_str = (
            "The computation of the information criteria has only "
            "been defined for models with a Binomial likelihood or a Tweedie "
            "likelihood with power <= 2."
        )
        if not isinstance(
            self.family_instance, (BinomialDistribution, TweedieDistribution)
        ):
            raise NotImplementedError(model_err_str)

        # the log_likelihood has not been implemented for the
        # InverseGaussianDistribution
        if (
            isinstance(self.family_instance, TweedieDistribution)
            and self.family_instance.power > 2
        ):
            raise NotImplementedError(model_err_str)

        # effective degrees of freedom: the number of non-zero coefficients
        ddof = np.sum(np.abs(self.coef_) > np.finfo(self.coef_.dtype).eps)
        k_params = ddof + self.fit_intercept
        nobs = X.shape[0]

        if nobs != self._num_obs:
            raise ValueError(
                "The same dataset that was used for training should "
                "also be used for the computation of information "
                "criteria."
            )

        mu = self.predict(X)
        ll = self.family_instance.log_likelihood(y, mu, sample_weight=sample_weight)

        aic = -2 * ll + 2 * k_params
        bic = -2 * ll + np.log(nobs) * k_params

        # the small-sample correction is undefined when nobs <= k_params + 1
        if nobs > k_params + 1:
            aicc = aic + 2 * k_params * (k_params + 1) / (nobs - k_params - 1)
        else:
            aicc = None

        self._info_criteria = {"aic": aic, "aicc": aicc, "bic": bic}

        return True
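    # A minimal numeric sketch (not part of this diff) of the computation
    # above, using hypothetical values. For an L1-penalised fit, the effective
    # degrees of freedom are estimated by the number of non-zero coefficients
    # (Zou et al., 2007):
    #
    #     coef = np.array([0.8, 0.0, -1.3, 0.0])  # two coefficients at zero
    #     ddof = np.sum(np.abs(coef) > np.finfo(coef.dtype).eps)  # -> 2
    #     k_params = ddof + 1                      # + 1 for a fitted intercept
    #     ll, nobs = -42.0, 100                    # hypothetical log-likelihood
    #     aic = -2 * ll + 2 * k_params             # 90.0
    #     bic = -2 * ll + np.log(nobs) * k_params  # ~97.82
    #     aicc = aic + 2 * k_params * (k_params + 1) / (nobs - k_params - 1)  # 90.25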

    def aic(
        self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None
    ):
        """
        Akaike's information criterion. Computed as:
        :math:`-2\\log\\hat{\\mathcal{L}} + 2\\hat{k}` where
        :math:`\\hat{\\mathcal{L}}` is the maximum likelihood estimate of the
        model, and :math:`\\hat{k}` is the effective number of parameters. See
        `_compute_information_criteria` for more information on the computation
        of :math:`\\hat{k}`.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Same data as used in 'fit'
        y : array-like, shape (n_samples,)
            Same data as used in 'fit'
        sample_weight : array-like, shape (n_samples,), optional (default=None)
            Same data as used in 'fit'
        """
        return self._get_info_criteria("aic", X, y, sample_weight)

    def aicc(
        self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None
    ):
        """
        Second-order Akaike's information criterion (or small-sample AIC).
        Computed as:
        :math:`-2\\log\\hat{\\mathcal{L}} + 2\\hat{k} + \\frac{2\\hat{k}(\\hat{k}+1)}{n-\\hat{k}-1}`
        where :math:`\\hat{\\mathcal{L}}` is the maximum likelihood estimate of
        the model, :math:`n` is the number of training instances, and
        :math:`\\hat{k}` is the effective number of parameters. See
        `_compute_information_criteria` for more information on the computation
        of :math:`\\hat{k}`.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Same data as used in 'fit'
        y : array-like, shape (n_samples,)
            Same data as used in 'fit'
        sample_weight : array-like, shape (n_samples,), optional (default=None)
            Same data as used in 'fit'
        """
        aicc = self._get_info_criteria("aicc", X, y, sample_weight)
        if aicc is None:
            raise ValueError(
                "The AICc is only defined when the number of training "
                "data points exceeds the effective degrees of freedom "
                "plus one."
            )
        return aicc

    def bic(
        self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None
    ):
        """
        Bayesian information criterion. Computed as:
        :math:`-2\\log\\hat{\\mathcal{L}} + \\hat{k}\\log(n)` where
        :math:`\\hat{\\mathcal{L}}` is the maximum likelihood estimate of the
        model, :math:`n` is the number of training instances, and
        :math:`\\hat{k}` is the effective number of parameters. See
        `_compute_information_criteria` for more information on the computation
        of :math:`\\hat{k}`.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Same data as used in 'fit'
        y : array-like, shape (n_samples,)
            Same data as used in 'fit'
        sample_weight : array-like, shape (n_samples,), optional (default=None)
            Same data as used in 'fit'
        """
        return self._get_info_criteria("bic", X, y, sample_weight)

    def _get_info_criteria(
        self,
        crit: str,
        X: ArrayLike,
        y: ArrayLike,
        sample_weight: Optional[ArrayLike] = None,
    ):
        check_is_fitted(self, "coef_")

        if not hasattr(self, "_info_criteria"):
            self._compute_information_criteria(X, y, sample_weight)

        # the degrees-of-freedom approximation only holds for pure L1 penalties
        if (self.alpha is None or self.alpha > 0) and self.l1_ratio < 1.0:
            warnings.warn(
                "There is no general definition for the model's degrees of "
                f"freedom under L2 (ridge) regularisation. The {crit} "
                "might not be well defined in these cases."
            )

        return self._info_criteria[crit]
55 changes: 55 additions & 0 deletions tests/glm/test_glm.py
@@ -2,6 +2,7 @@
#
# License: BSD 3 clause
import copy
import warnings
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
@@ -17,6 +18,7 @@
from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.utils.estimator_checks import check_estimator
from statsmodels.tools import eval_measures

from glum import GeneralizedLinearRegressorCV
from glum._distribution import (
@@ -1788,3 +1790,56 @@ def test_score_method(as_data_frame, offset, weighted):

    # use pytest because NumPy used to always reject comparisons against zero
    assert pytest.approx(score, 1e-8) == int(offset is not None)


def test_information_criteria(regression_data):
    X, y = regression_data
    regressor = GeneralizedLinearRegressor(family="gaussian", alpha=0)
    regressor.fit(X, y)

    llf = regressor.family_instance.log_likelihood(y, regressor.predict(X))
    nobs, df = X.shape[0], X.shape[1] + 1
    sm_aic = eval_measures.aic(llf, nobs, df)
    sm_bic = eval_measures.bic(llf, nobs, df)
    sm_aicc = eval_measures.aicc(llf, nobs, df)

    assert np.allclose(
        [sm_aic, sm_aicc, sm_bic],
        [regressor.aic(X, y), regressor.aicc(X, y), regressor.bic(X, y)],
        atol=1e-8,
    )


@pytest.mark.filterwarnings("ignore: There is no")
def test_information_criteria_raises_correct_warnings_and_errors(regression_data):
    X, y = regression_data

    # test that no warnings are raised for L1 regularisation
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        regressor = GeneralizedLinearRegressor(family="normal", l1_ratio=1.0)
        regressor.fit(X, y)
        regressor.aic(X, y), regressor.aicc(X, y), regressor.bic(X, y)

    # test that warnings are raised for L2 regularisation
    with pytest.warns(match="There is no") as records:
        regressor = GeneralizedLinearRegressor(family="normal", l1_ratio=0.0)
        regressor.fit(X, y)
        regressor.aic(X, y), regressor.aicc(X, y), regressor.bic(X, y)
    assert len(records) == 3

    # test that exceptions are raised when the information criteria are
    # called before the model has been fitted
    regressor = GeneralizedLinearRegressor()
    with pytest.raises(Exception):
        regressor.aic(X, y)
    with pytest.raises(Exception):
        regressor.aicc(X, y)
    with pytest.raises(Exception):
        regressor.bic(X, y)

    # test that an exception is raised when data other than the training set
    # is used
    regressor.fit(X, y)
    X_not_train = np.ones((10, 2))
    y_not_train = np.ones(10)
    with pytest.raises(Exception):
        regressor.aic(X_not_train, y_not_train)