Merge pull request #158 from X-DataInitiative/TICK-361

Add ConvSCCS model + refactoring of SCCS-related stuff (preprocessing, simulations, etc.)
X-DataInitiative · Mar 22, 2018 · 088393b · 088393b
2 parents 730d91b + 1f9c926
commit 088393b
Show file tree

Hide file tree

Showing 27 changed files with 2,334 additions and 615 deletions.
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
@@ -43,3 +43,4 @@ similar API to scikit-learn preprocessors.
 
    LongitudinalFeaturesProduct
    LongitudinalFeaturesLagger
+   LongitudinalSamplesFilter
diff --git a/doc/modules/survival.rst b/doc/modules/survival.rst
@@ -25,6 +25,7 @@ Kaplan-Meier estimators.
    CoxRegression
    nelson_aalen
    kaplan_meier
+   ConvSCCS
 
 2. Models
 =========
@@ -69,3 +70,6 @@ Example
 
 .. plot:: ../examples/plot_simulation_coxreg.py
     :include-source:
+
+.. plot:: ../examples/plot_conv_sccs_cv_results.py
+    :include-source:
diff --git a/examples/plot_conv_sccs_cv_results.py b/examples/plot_conv_sccs_cv_results.py
@@ -0,0 +1,145 @@
+"""
+====================================================================
+ConvSCCS cross validation on simulated longitudinal features example
+====================================================================
+
+In this example we simulate longitudinal data with preset relative incidence
+for each feature. We then perform a cross validation of the ConvSCCS model
+and compare the estimated coefficients to the relative incidences used for
+the simulation.
+"""
+from time import time
+import numpy as np
+from scipy.sparse import csr_matrix, hstack
+from matplotlib import cm
+import matplotlib.pylab as plt
+from tick.survival.simu_sccs import CustomEffects
+from tick.survival import SimuSCCS, ConvSCCS
+from mpl_toolkits.axes_grid1 import make_axes_locatable
+
+# Simulation parameters
+seed = 0
+lags = 49
+n_samples = 2000
+n_intervals = 750
+n_corr = 3
+
+# Relative incidence functions used for the simulation
+ce = CustomEffects(lags + 1)
+null_effect = [ce.constant_effect(1)] * 2
+intermediate_effect = ce.bell_shaped_effect(2, 30, 15, 15)
+late_effects = ce.increasing_effect(2, curvature_type=4)
+
+sim_effects = [*null_effect, intermediate_effect, late_effects]
+
+n_features = len(sim_effects)
+n_lags = np.repeat(lags, n_features).astype('uint64')
+
+coeffs = [np.log(c) for c in sim_effects]
+
+# Time drift (age effect) used for the simulations.
+time_drift = lambda t: np.log(8 * np.sin(.01 * t) + 9)
+
+# Simaltion of the features.
+sim = SimuSCCS(n_samples, n_intervals, n_features, n_lags,
+               time_drift=time_drift, coeffs=coeffs, seed=seed,
+               n_correlations=n_corr, verbose=False)
+features, censored_features, labels, censoring, coeffs = sim.simulate()
+
+# Plot the Hawkes kernel matrix used to generate the features.
+fig, ax = plt.subplots(figsize=(7, 6))
+heatmap = ax.pcolor(sim.hawkes_exp_kernels.adjacency, cmap=cm.Blues)
+divider = make_axes_locatable(ax)
+cax = divider.append_axes("right", size="5%", pad=0.5)
+fig.colorbar(heatmap, cax=cax)
+ax.set_title('Hawkes adjacency matrix used for the simulation');
+plt.show()
+
+## Add age_groups features to feature matrices.
+agegrps = [0, 125, 250, 375, 500, 625, 750]
+n_agegrps = len(agegrps) - 1
+
+feat_agegrp = np.zeros((n_intervals, n_agegrps))
+for i in range(n_agegrps):
+    feat_agegrp[agegrps[i]:agegrps[i + 1], i] = 1
+
+feat_agegrp = csr_matrix(feat_agegrp)
+features = [hstack([f, feat_agegrp]).tocsr() for f in features]
+censored_features = [hstack([f, feat_agegrp]).tocsr() for f in
+                     censored_features]
+n_lags = np.hstack([n_lags, np.zeros(n_agegrps)])
+
+# Learning
+# Example code for cross validation
+# start = time()
+# learner = ConvSCCS(n_lags=n_lags.astype('uint64'),
+#                    penalized_features=np.arange(n_features),
+#                    random_state=42)
+# C_TV_range = (1, 4)
+# C_L1_range = (2, 5)
+# _, cv_track = learner.fit_kfold_cv(features, labels, censoring,
+#                                    C_TV_range, C_L1_range,
+#                                    confidence_intervals=True,
+#                                    n_samples_bootstrap=20, n_cv_iter=50)
+# elapsed_time = time() - start
+# print("Elapsed time (model training): %.2f seconds \n" % elapsed_time)
+# print("Best model hyper parameters: \n")
+# print("C_tv : %f \n" % cv_track.best_model['C_tv'])
+# print("C_group_l1 : %f \n" % cv_track.best_model['C_group_l1'])
+# cv_track.plot_cv_report(35, 45)
+# plt.show()
+# confidence_intervals = cv_track.best_model['confidence_intervals']
+
+# using the parameters resulting from cross-validation
+learner = ConvSCCS(n_lags=n_lags.astype('uint64'),
+                   penalized_features=np.arange(n_features),
+                   random_state=42, C_tv=270.2722840570933,
+                   C_group_l1=5216.472772625124)
+
+_, confidence_intervals = learner.fit(features, labels, censoring,
+                                      confidence_intervals=True,
+                                      n_samples_bootstrap=20)
+
+# Plot estimated parameters
+# get bootstrap confidence intervals
+refitted_coeffs = confidence_intervals['refit_coeffs']
+lower_bound = confidence_intervals['lower_bound']
+upper_bound = confidence_intervals['upper_bound']
+
+n_rows = int(np.ceil(n_features / 2))
+remove_last_plot = (n_features % 2 != 0)
+
+fig, axarr = plt.subplots(n_rows, 2, sharex=True, sharey=True, figsize=(10, 6))
+y = confidence_intervals['refit_coeffs']
+lb = confidence_intervals['lower_bound']
+ub = confidence_intervals['upper_bound']
+for i, c in enumerate(y[:-6]):
+    ax = axarr[i // 2][i % 2]
+    l = n_lags[i]
+    ax.plot(np.exp(coeffs[i]), label="True RI")
+    ax.step(np.arange(l + 1), np.exp(c), label="Estimated RI")
+    ax.fill_between(np.arange(l + 1), np.exp(lb[i]), np.exp(ub[i]), alpha=.5,
+                    color='orange', step='pre', label="95% boostrap CI")
+plt.suptitle('Estimated relative risks with 95% confidence bands')
+axarr[0][1].legend(loc='best')
+[ax[0].set_ylabel('Relative incidence') for ax in axarr]
+[ax.set_xlabel('Time after exposure start') for ax in axarr[-1]]
+if remove_last_plot:
+    fig.delaxes(axarr[-1][-1])
+plt.show()
+
+normalize = lambda x: x / np.sum(x)
+m = np.repeat(np.hstack(refitted_coeffs[-6:]), 125)
+lb = np.repeat(np.hstack(lower_bound[-6:]), 125)
+ub = np.repeat(np.hstack(upper_bound[-6:]), 125)
+plt.figure()
+plt.plot(np.arange(n_intervals),
+         normalize(np.exp(time_drift(np.arange(n_intervals)))))
+plt.step(np.arange(n_intervals), normalize(np.exp(m)))
+plt.fill_between(np.arange(n_intervals), np.exp(lb) / np.exp(m).sum(),
+                 np.exp(ub) / np.exp(m).sum(), alpha=.5, color='orange',
+                 step='pre')
+plt.xlabel('Age')
+plt.ylabel('Normalized Age Relative Incidence')
+plt.title("Normalized age effect with 95% confidence bands");
+plt.show()
diff --git a/lib/cpp/preprocessing/longitudinal_features_lagger.cpp b/lib/cpp/preprocessing/longitudinal_features_lagger.cpp
@@ -7,15 +7,27 @@
 #include "tick/preprocessing/longitudinal_features_lagger.h"
 
 LongitudinalFeaturesLagger::LongitudinalFeaturesLagger(
-    const SBaseArrayDouble2dPtrList1D &features, const ulong n_lags)
+    const SBaseArrayDouble2dPtrList1D &features,
+    const SArrayULongPtr n_lags)
     : n_intervals(features[0]->n_rows()),
       n_lags(n_lags),
       n_samples(features.size()),
       n_observations(n_samples * n_intervals),
       n_features(features[0]->n_cols()),
-      n_lagged_features(n_features * (n_lags + 1)) {
-  if (n_lags >= n_intervals) {
-    TICK_ERROR("n_lags must be between 0 and (n_intervals - 1)");
+      n_lagged_features(n_lags->sum() + n_lags->size()) {
+  col_offset = ArrayULong(n_lags->size());
+  col_offset.init_to_zero();
+  if (n_features != n_lags->size()) {
+    TICK_ERROR("Features matrix column number should match n_lags length.");
+  }
+  if ((*n_lags)[0] >= n_intervals) {
+    TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
+  }
+  for (ulong i(1); i < n_lags->size(); i++) {
+    if ((*n_lags)[i] >= n_intervals) {
+      TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
+    }
+    col_offset[i] = col_offset[i - 1] + (*n_lags)[i-1] + 1;
   }
 }
 
@@ -24,22 +36,22 @@ void LongitudinalFeaturesLagger::dense_lag_preprocessor(ArrayDouble2d &features,
                                                         ulong censoring) const {
   if (out.n_cols() != n_lagged_features) {
     TICK_ERROR(
-        "n_columns of &out is inconsistent with n_features * (n_lags + 1).");
+        "n_columns of &out should be equal to n_features + sum(n_lags).");
   }
   if (out.n_rows() != n_intervals) {
     TICK_ERROR("n_rows of &out is inconsistent with n_intervals");
   }
-  ulong sample, row, col;
+  ulong n_cols_feature, row, col, max_col;
   double value;
   for (ulong feature = 0; feature < n_features; feature++) {
+    n_cols_feature = (*n_lags)[feature] + 1;
     for (ulong j = 0; j < n_intervals; j++) {
       row = j;
-      sample = row / n_intervals;
-      col = feature * (n_lags + 1);
+      col = col_offset[feature];
       value = features(row, feature);
+      max_col = col + n_cols_feature;
       if (value != 0) {
-        while (row < censoring && row / n_intervals == sample &&
-               col / (n_lags + 1) == feature) {
+        while (row < censoring && col < max_col) {
           out[row * n_lagged_features + col] = value;
           row++;
           col++;
@@ -49,23 +61,30 @@ void LongitudinalFeaturesLagger::dense_lag_preprocessor(ArrayDouble2d &features,
   }
 }
 
-void LongitudinalFeaturesLagger::sparse_lag_preprocessor(
-    ArrayULong &row, ArrayULong &col, ArrayDouble &data, ArrayULong &out_row,
-    ArrayULong &out_col, ArrayDouble &out_data, ulong censoring) const {
-  ulong j = 0;
-  for (ulong i = 0; i < row.size(); i++) {
-    double value = data[i];
-    ulong r = row[i];
-    ulong c = col[i] * (n_lags + 1);
-    ulong sample = r / n_intervals;
-    ulong feature = c / (n_lags + 1);
-    while (r < censoring && r / n_intervals == sample &&
-           c / (n_lags + 1) == feature) {
+void LongitudinalFeaturesLagger::sparse_lag_preprocessor(ArrayULong &row,
+                                                         ArrayULong &col,
+                                                         ArrayDouble &data,
+                                                         ArrayULong &out_row,
+                                                         ArrayULong &out_col,
+                                                         ArrayDouble &out_data,
+                                                         ulong censoring) const {
+  ulong j(0), r, c, offset, new_col, max_col;
+  double value;
+
+  for (ulong i = 0; i < data.size(); i++) {
+    value = data[i];
+    r = row[i];
+    c = col[i];
+    offset = col_offset[c];
+    max_col = offset + (*n_lags)[c] + 1;
+    new_col = offset;
+
+    while (r < censoring && new_col < max_col) {
       out_row[j] = r;
-      out_col[j] = c;
+      out_col[j] = new_col;
       out_data[j] = value;
       r++;
-      c++;
+      new_col++;
       j++;
     }
   }

diff --git a/lib/cpp/survival/model_sccs.cpp b/lib/cpp/survival/model_sccs.cpp
@@ -12,29 +12,32 @@
 
 ModelSCCS::ModelSCCS(const SBaseArrayDouble2dPtrList1D &features,
                      const SArrayIntPtrList1D &labels,
-                     const SBaseArrayULongPtr censoring, ulong n_lags)
-    : n_lags(n_lags), labels(labels), features(features), censoring(censoring) {
-  if (features.size() == 0) TICK_ERROR("ModelSCCS: features empty");
-
-  n_samples = features.size();
-  n_intervals = features[0]->n_rows();
-  n_observations = (n_samples * n_intervals);
-
-  n_lagged_features = features[0]->n_cols();
-  n_features =
-      n_lags > 0 ? n_lagged_features / (n_lags + 1) : n_lagged_features;
-
-  if (n_lags >= n_intervals)
-    TICK_ERROR("ModelSCCS requires n_lags < n_intervals");
+                     const SBaseArrayULongPtr censoring,
+                     const SArrayULongPtr n_lags)
+    : n_intervals(features[0]->n_rows()),
+      n_lags(n_lags),
+      n_samples(features.size()),
+      n_observations(n_samples * n_intervals),
+      n_lagged_features(n_lags->sum() + n_lags->size()),
+      n_features(n_lags->size()),
+      labels(labels),
+      features(features),
+      censoring(censoring) {
+  if ((*n_lags)[0] >= n_intervals) {
+    TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
+  }
+  col_offset = ArrayULong(n_lags->size());
+  col_offset.init_to_zero();
+  for (ulong i(1); i < n_lags->size(); i++) {
+    if ((*n_lags)[i] >= n_intervals) {
+      TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
+    }
+    col_offset[i] = col_offset[i - 1] + (*n_lags)[i-1] + 1;
+  }
 
   if (n_samples != labels.size() || n_samples != (*censoring).size())
     TICK_ERROR("features, labels and censoring should have equal length.");
 
-  if (n_lags > 0 && n_lagged_features % (n_lags + 1) != 0)
-    TICK_ERROR(
-        "n_lags should be a divisor of the number of feature matrices "
-        "columns.");
-
   for (ulong i(0); i < n_samples; i++) {
     if (features[i]->n_rows() != n_intervals)
       TICK_ERROR("All feature matrices should have " << n_intervals << " rows");
@@ -62,7 +65,8 @@ double ModelSCCS::loss_i(const ulong i, const ArrayDouble &coeffs) {
 
   for (ulong t = 0; t < max_interval; t++)
     inner_prod[t] = get_inner_prod(i, t, coeffs);
-  for (ulong t = max_interval; t < n_intervals; t++) inner_prod[t] = 0;
+  if (max_interval < n_intervals)
+    view(inner_prod, max_interval, n_intervals).fill(0);
 
   softMax(inner_prod, softmax);
 
@@ -98,7 +102,7 @@ void ModelSCCS::grad_i(const ulong i, const ArrayDouble &coeffs,
     inner_prod[t] = get_inner_prod(i, t, coeffs);
 
   if (max_interval < n_intervals)
-    view(inner_prod, max_interval, n_intervals).fill(0);  // TODO
+    view(inner_prod, max_interval, n_intervals).fill(0);
 
   double x_max = inner_prod.max();
   double sum_exp = sumExpMinusMax(inner_prod, x_max);

diff --git a/lib/include/tick/preprocessing/longitudinal_features_lagger.h b/lib/include/tick/preprocessing/longitudinal_features_lagger.h
@@ -14,15 +14,16 @@
 class LongitudinalFeaturesLagger {
  protected:
   ulong n_intervals;
-  ulong n_lags;
+  SArrayULongPtr n_lags;
+  ArrayULong col_offset;
   ulong n_samples;
   ulong n_observations;
   ulong n_features;
   ulong n_lagged_features;
 
  public:
   LongitudinalFeaturesLagger(const SBaseArrayDouble2dPtrList1D &features,
-                             const ulong n_lags);
+                             const SArrayULongPtr n_lags);
 
   void dense_lag_preprocessor(ArrayDouble2d &features, ArrayDouble2d &out,
                               ulong censoring) const;
@@ -36,6 +37,7 @@ class LongitudinalFeaturesLagger {
   void serialize(Archive &ar) {
     ar(CEREAL_NVP(n_intervals));
     ar(CEREAL_NVP(n_lags));
+    ar(CEREAL_NVP(col_offset));
     ar(CEREAL_NVP(n_samples));
     ar(CEREAL_NVP(n_observations));
     ar(CEREAL_NVP(n_features));

diff --git a/lib/include/tick/survival/model_sccs.h b/lib/include/tick/survival/model_sccs.h
@@ -16,7 +16,8 @@ class DLL_PUBLIC ModelSCCS : public ModelLipschitz {
 
  protected:
   ulong n_intervals;
-  ulong n_lags;
+  SArrayULongPtr n_lags;
+  ArrayULong col_offset;
   ulong n_samples;
   ulong n_observations;
   ulong n_lagged_features;
@@ -34,7 +35,8 @@ class DLL_PUBLIC ModelSCCS : public ModelLipschitz {
  public:
   ModelSCCS(const SBaseArrayDouble2dPtrList1D &features,
             const SArrayIntPtrList1D &labels,
-            const SBaseArrayULongPtr censoring, ulong n_lags);
+            const SBaseArrayULongPtr censoring,
+            const SArrayULongPtr n_lags);
 
   double loss(const ArrayDouble &coeffs) override;
 

diff --git a/lib/swig/preprocessing/longitudinal_features_lagger.i b/lib/swig/preprocessing/longitudinal_features_lagger.i
@@ -8,7 +8,7 @@ class LongitudinalFeaturesLagger {
 
  public:
   LongitudinalFeaturesLagger(const SBaseArrayDouble2dPtrList1D &features,
-                             const ulong n_lags);
+                             const SArrayULongPtr n_lags);
 
   void dense_lag_preprocessor(ArrayDouble2d &features,
                               ArrayDouble2d &out,