Merge pull request statsmodels#1823 from josef-pkt/gee-cat-subclass_1…

…694_rebase REF: Gee cat subclass 1694 rebased, closes statsmodels#1694
PierreBdR · Jul 11, 2014 · 2081ff3 · 2081ff3
2 parents f8e702c + 8d206bb
commit 2081ff3
Show file tree

Hide file tree

Showing 4 changed files with 721 additions and 333 deletions.
diff --git a/docs/source/release/version0.6.rst b/docs/source/release/version0.6.rst
@@ -36,13 +36,13 @@ covariates.
    from statsmodels.genmod.generalized_estimating_equations import GEE
    from statsmodels.genmod.dependence_structures import Independence
    from statsmodels.genmod.families import Poisson
-   
+
    data_url = "http://vincentarelbundock.github.io/Rdatasets/csv/MASS/epil.csv"
    data = pd.read_csv(data_url)
-   
+
    fam = Poisson()
    ind = Independence()
-   md = GEE.from_formula("y ~ age + trt + base", data, groups=data["subject"],
+   md = GEE.from_formula("y ~ age + trt + base", "subject", data,
                           covstruct=ind, family=fam)
    mdf = md.fit()
    print mdf.summary()
@@ -96,7 +96,7 @@ We added a naive seasonal decomposition tool in the same vein as R's ``decompose
     import statsmodels.api as sm
 
     dta = sm.datasets.co2.load_pandas().data
-    # deal with missing values. see issue 
+    # deal with missing values. see issue
     dta.co2.interpolate(inplace=True)
 
     res = sm.tsa.seasonal_decompose(dta.co2)

diff --git a/statsmodels/genmod/dependence_structures/covstruct.py b/statsmodels/genmod/dependence_structures/covstruct.py
@@ -1,6 +1,10 @@
 from statsmodels.compat.python import iterkeys, itervalues, zip, range
+from statsmodels.stats.correlation_tools import cov_nearest
 import numpy as np
 from scipy import linalg as spl
+from statsmodels.tools.sm_exceptions import (ConvergenceWarning,
+                                             IterationLimitWarning)
+import warnings
 
 
 class CovStruct(object):
@@ -24,6 +28,10 @@ def __init__(self):
         # Parameters describing the dependency structure
         self.dep_params = None
 
+        self.cov_adjust = 0
+
+
+
     def initialize(self, model):
         """
         Called by GEE, used by implementations that need additional
@@ -49,7 +57,8 @@ def update(self, params):
         raise NotImplementedError
 
     def covariance_matrix(self, endog_expval, index):
-        """Returns the working covariance or correlation matrix for a
+        """
+        Returns the working covariance or correlation matrix for a
         given cluster of data.
 
         Parameters
@@ -105,6 +114,10 @@ def covariance_matrix_solve(self, expval, index, stdev, rhs):
         (e.g. binomial) do not use the `stdev` parameter when forming
         the covariance matrix.
 
+        If the covariance matrix is singular or not SPD, it is
+        projected to the nearest such matrix.  These projection events
+        are recorded in the fit_history member of the GEE model.
+
         Systems of linear equations with the covariance matrix as the
         left hand side (LHS) are solved for different right hand sides
         (RHS); the LHS is only factorized once to save time.
@@ -119,9 +132,25 @@ def covariance_matrix_solve(self, expval, index, stdev, rhs):
         if is_cor:
             vmat *= np.outer(stdev, stdev)
 
-        try:
-            vco = spl.cho_factor(vmat)
-        except np.linalg.LinAlgError:
+        # Factor, the covariance matrix.  If the factorization fails,
+        # attempt to condition it into a factorizable matrix.
+        threshold = 1e-2
+        success = False
+        cov_adjust = 0
+        for itr in range(10):
+            try:
+                vco = spl.cho_factor(vmat)
+                success = True
+                break
+            except np.linalg.LinAlgError:
+                vmat = cov_nearest(vmat, method="nearest", threshold=threshold)
+                threshold *= 2
+                cov_adjust = 1
+
+        self.cov_adjust += cov_adjust
+        if success == False:
+            warnings.warn("Unable to condition covariance matrix to an SPD matrix",
+                          ConvergenceWarning)
             return None
 
         soln = [spl.cho_solve(vco, x) for x in rhs]
@@ -163,7 +192,7 @@ def covariance_matrix_solve(self, expval, index, stdev, rhs):
     covariance_matrix_solve.__doc__ = CovStruct.covariance_matrix_solve.__doc__
 
     def summary(self):
-        return "Observations within a cluster are independent."
+        return "Observations within a cluster are modeled as being independent."
 
 
 class Exchangeable(CovStruct):
@@ -668,10 +697,6 @@ def __init__(self, endog_type):
 
     def initialize(self, model):
 
-        # Delay processing until model.setup_ordinal has been called.
-        if model.ordinal == False and model.nominal == False:
-            return
-
         super(GlobalOddsRatio, self).initialize(model)
 
         self.nlevel = len(model.endog_values)
@@ -685,19 +710,30 @@ def initialize(self, model):
             ibd.append(ibd1)
         self.ibd = ibd
 
+        # Need to restrict to between-subject pairs
         cpp = []
         for v in model.endog_li:
-            m = len(v) / self.ncut
-            jj = np.kron(np.ones(m), np.arange(self.ncut))
-            j1 = np.outer(jj, np.ones(len(jj)))
-            j2 = np.outer(np.ones(len(jj)), jj)
+
+            # Number of subjects in this group
+            m = int(len(v) / self.ncut)
+
             cpp1 = {}
-            for k1 in range(self.ncut):
-                for k2 in range(k1+1):
-                    v1, v2 = np.nonzero((j1 == k1) & (j2 == k2))
-                    cpp1[(k2, k1)] = \
-                        np.hstack((v2[:, None], v1[:, None]))
+            # Loop over distinct subject pairs
+            for i1 in range(m):
+                for i2 in range(i1):
+                    # Loop over cut point pairs
+                    for k1 in range(self.ncut):
+                        for k2 in range(k1+1):
+                            if (k2, k1) not in cpp1:
+                                cpp1[(k2, k1)] = []
+                            j1 = i1*self.ncut + k1
+                            j2 = i2*self.ncut + k2
+                            cpp1[(k2, k1)].append([j2, j1])
+
+            for k in cpp1.keys():
+                cpp1[k] = np.asarray(cpp1[k])
             cpp.append(cpp1)
+
         self.cpp = cpp
 
         # Initialize the dependence parameters
@@ -739,12 +775,13 @@ def covariance_matrix(self, expected_value, index):
         return vmat, False
 
     def observed_crude_oddsratio(self):
-        """The crude odds ratio is obtained by pooling all data
-        corresponding to a given pair of cut points (c,c'), then
-        forming the inverse variance weighted average of these odds
-        ratios to obtain a single OR.  Since the covariate effects are
-        ignored, this OR will generally be greater than the stratified
-        OR.
+        """
+        To obtain the crude (global) odds ratio, first pool all binary
+        indicators corresponding to a given pair of cut points (c,c'),
+        then calculate the odds ratio for this 2x2 table.  The crude
+        odds ratio is the inverse variance weighted average of these
+        odds ratios.  Since the covariate effects are ignored, this OR
+        will generally be greater than the stratified OR.
         """
 
         cpp = self.cpp
@@ -779,7 +816,7 @@ def get_eyy(self, endog_expval, index):
         """
         Returns a matrix V such that V[i,j] is the joint probability
         that endog[i] = 1 and endog[j] = 1, based on the marginal
-        probabilities of endog and the odds ratio `current_or`.
+        probabilities of endog and the global odds ratio `current_or`.
         """
 
         current_or = self.dep_params
@@ -810,8 +847,10 @@ def get_eyy(self, endog_expval, index):
         return vmat
 
     def update(self, params):
-        """Update the global odds ratio based on the current value of
-        params."""
+        """
+        Update the global odds ratio based on the current value of
+        params.
+        """
 
         endog = self.model.endog_li
         cpp = self.cpp
@@ -846,6 +885,11 @@ def update(self, params):
         cor_expval = self.pooled_odds_ratio(list(itervalues(tables)))
 
         self.dep_params *= self.crude_or / cor_expval
+        if not np.isfinite(self.dep_params):
+            self.dep_params = 1.
+            warnings.warn("dep_params became inf, resetting to 1",
+                          ConvergenceWarning)
+
 
     update.__doc__ = CovStruct.update.__doc__
     covariance_matrix.__doc__ = CovStruct.covariance_matrix.__doc__